diff --git a/.bazelignore b/.bazelignore index 61b5e9458df6e..01fcdd0d8e050 100644 --- a/.bazelignore +++ b/.bazelignore @@ -1,3 +1,4 @@ # We do not use this library in our Bazel build. It contains an # infinitely recursing symlink that makes Bazel very unhappy. third_party/ittapi/ +third_party/opentelemetry-cpp diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index a00502dd81d24..426f4698c2b00 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -204,7 +204,7 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - ROCM_VERSION=5.6 + ROCM_VERSION=6.0 NINJA_VERSION=1.9.0 CONDA_CMAKE=yes TRITON=yes @@ -215,7 +215,7 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - ROCM_VERSION=5.7 + ROCM_VERSION=6.1 NINJA_VERSION=1.9.0 CONDA_CMAKE=yes TRITON=yes @@ -229,6 +229,7 @@ case "$image" in BASEKIT_VERSION=2024.0.0-49522 NINJA_VERSION=1.9.0 CONDA_CMAKE=yes + TRITON=yes ;; pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks) ANACONDA_PYTHON_VERSION=3.8 @@ -277,6 +278,7 @@ case "$image" in CONDA_CMAKE=yes TRITON=yes DOCS=yes + UNINSTALL_DILL=yes ;; pytorch-linux-jammy-py3-clang12-executorch) ANACONDA_PYTHON_VERSION=3.10 @@ -296,6 +298,21 @@ case "$image" in CUDA_VERSION=11.8 CONDA_CMAKE=yes ;; + pytorch-linux-jammy-aarch64-py3.10-gcc11) + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=11 + ACL=yes + PROTOBUF=yes + DB=yes + VISION=yes + CONDA_CMAKE=yes + # snadampal: skipping sccache due to the following issue + # https://github.com/pytorch/pytorch/issues/121559 + SKIP_SCCACHE_INSTALL=yes + # snadampal: skipping llvm src build install because the current version + # from pytorch/llvm:9.0.1 is x86 specific + SKIP_LLVM_SRC_BUILD_INSTALL=yes + ;; *) # Catch-all for builds that are not hardcoded. PROTOBUF=yes @@ -387,6 +404,9 @@ docker build \ --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \ --build-arg "EXECUTORCH=${EXECUTORCH}" \ --build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \ + --build-arg "ACL=${ACL:-}" \ + --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ + --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ -f $(dirname ${DOCKERFILE})/Dockerfile \ -t "$tmp_tag" \ "$@" \ diff --git a/.ci/docker/ci_commit_pins/executorch.txt b/.ci/docker/ci_commit_pins/executorch.txt index 258c01ef0ca02..adf618bf2fedd 100644 --- a/.ci/docker/ci_commit_pins/executorch.txt +++ b/.ci/docker/ci_commit_pins/executorch.txt @@ -1 +1 @@ -663882fe7dc518c04adf3d2ee5ccb7d99f41ade4 +d4b3e5cc607e97afdba79dc90f8ef968142f347c diff --git a/.ci/docker/ci_commit_pins/huggingface.txt b/.ci/docker/ci_commit_pins/huggingface.txt index a5f4dc315ee17..f00d6ca4f9ca7 100644 --- a/.ci/docker/ci_commit_pins/huggingface.txt +++ b/.ci/docker/ci_commit_pins/huggingface.txt @@ -1 +1 @@ -6c26faa159b79a42d7fa46cb66e2d21523351987 +243e186efbf7fb93328dd6b34927a4e8c8f24395 diff --git a/.ci/docker/ci_commit_pins/triton-rocm.txt b/.ci/docker/ci_commit_pins/triton-rocm.txt index 4a873428eaa69..2df035af1fdd7 100644 --- a/.ci/docker/ci_commit_pins/triton-rocm.txt +++ b/.ci/docker/ci_commit_pins/triton-rocm.txt @@ -1 +1 @@ -dafe1459823b9549417ed95e9720f1b594fab329 +bbe6246e37d8aa791c67daaf9d9d61b26c9ccfdc diff --git a/.ci/docker/ci_commit_pins/triton-xpu.txt b/.ci/docker/ci_commit_pins/triton-xpu.txt new file mode 100644 index 0000000000000..36ca144cb6ed5 --- /dev/null +++ b/.ci/docker/ci_commit_pins/triton-xpu.txt @@ -0,0 +1 @@ +b8c64f64c18d8cac598b3adb355c21e7439c21de diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index dc4dffc8b700c..26516efc0b525 100644 
--- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -e28a256d71f3cf2bcc7b69d6bda73a9b855e385e +45fff310c891f5a92d55445adf8cc9d29df5841e diff --git a/.ci/docker/common/install_acl.sh b/.ci/docker/common/install_acl.sh new file mode 100644 index 0000000000000..f5e5ce92af4af --- /dev/null +++ b/.ci/docker/common/install_acl.sh @@ -0,0 +1,16 @@ +set -euo pipefail + +readonly version=v23.08 +readonly src_host=https://review.mlplatform.org/ml +readonly src_repo=ComputeLibrary + +# Clone ACL +[[ ! -d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git +cd ${src_repo} + +git checkout $version + +# Build with scons +scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \ + os=linux arch=armv8a build=native multi_isa=1 \ + fixed_format_kernels=1 openmp=1 cppthreads=0 diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh index e3568b200060b..ebaa17878ade4 100755 --- a/.ci/docker/common/install_base.sh +++ b/.ci/docker/common/install_base.sh @@ -113,7 +113,6 @@ install_centos() { glibc-devel \ glibc-headers \ glog-devel \ - hiredis-devel \ libstdc++-devel \ libsndfile-devel \ make \ @@ -153,7 +152,7 @@ wget https://ossci-linux.s3.amazonaws.com/valgrind-${VALGRIND_VERSION}.tar.bz2 tar -xjf valgrind-${VALGRIND_VERSION}.tar.bz2 cd valgrind-${VALGRIND_VERSION} ./configure --prefix=/usr/local -make -j6 +make -j$[$(nproc) - 2] sudo make install cd ../../ rm -rf valgrind_build diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index 2cbb49c6af312..3a4b48c4d7a33 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -9,10 +9,19 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1) MINOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 2) +if [[ $(uname -m) == "aarch64" ]]; then + BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" case "$MAJOR_PYTHON_VERSION" in - 2) - CONDA_FILE="Miniconda2-latest-Linux-x86_64.sh" + 3) + CONDA_FILE="Miniforge3-Linux-aarch64.sh" ;; + *) + echo "Unsupported ANACONDA_PYTHON_VERSION: $ANACONDA_PYTHON_VERSION" + exit 1 + ;; + esac +else + case "$MAJOR_PYTHON_VERSION" in 3) CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh" ;; @@ -21,6 +30,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then exit 1 ;; esac +fi mkdir -p /opt/conda chown jenkins:jenkins /opt/conda @@ -47,15 +57,39 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then # Uncomment the below when resolved to track the latest conda update # as_jenkins conda update -y -n base conda + if [[ $(uname -m) == "aarch64" ]]; then + export SYSROOT_DEP="sysroot_linux-aarch64=2.17" + else + export SYSROOT_DEP="sysroot_linux-64=2.17" + fi + # Install correct Python version - as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" + # Also ensure sysroot is using a modern GLIBC to match system compilers + as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\ + python="$ANACONDA_PYTHON_VERSION" \ + ${SYSROOT_DEP} + + # libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30 + # which is provided in libstdcxx 12 and up. 
+ conda_install libstdcxx-ng=12.3.0 -c conda-forge # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README - CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools" - if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ]; then - conda_install numpy=1.23.5 ${CONDA_COMMON_DEPS} + if [[ $(uname -m) == "aarch64" ]]; then + CONDA_COMMON_DEPS="astunparse pyyaml setuptools openblas==0.3.25=*openmp* ninja==1.11.1 scons==4.5.2" + + if [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then + conda_install numpy=1.24.4 ${CONDA_COMMON_DEPS} + else + conda_install numpy=1.26.2 ${CONDA_COMMON_DEPS} + fi else - conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS} + CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools" + + if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then + conda_install numpy=1.26.0 ${CONDA_COMMON_DEPS} + else + conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS} + fi fi # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source @@ -89,14 +123,5 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then pip_install -r /opt/conda/requirements-docs.txt fi - # HACK HACK HACK - # gcc-9 for ubuntu-18.04 from http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu - # Pulls llibstdc++6 13.1.0-8ubuntu1~18.04 which is too new for conda - # So remove libstdc++6.so.3.29 installed by https://anaconda.org/anaconda/libstdcxx-ng/files?version=11.2.0 - # Same is true for gcc-12 from Ubuntu-22.04 - if grep -e [12][82].04.[623] /etc/issue >/dev/null; then - rm /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/lib/libstdc++.so.6 - fi - popd fi diff --git a/.ci/docker/common/install_db.sh b/.ci/docker/common/install_db.sh index 7e18947acbd3b..7e7234063b917 100755 --- a/.ci/docker/common/install_db.sh +++ b/.ci/docker/common/install_db.sh @@ -4,11 +4,6 @@ set -ex install_ubuntu() { apt-get update - apt-get install -y --no-install-recommends \ - libhiredis-dev \ - libleveldb-dev \ - liblmdb-dev \ - libsnappy-dev # Cleanup apt-get autoclean && apt-get clean @@ -20,12 +15,6 @@ install_centos() { # See http://fedoraproject.org/wiki/EPEL yum --enablerepo=extras install -y epel-release - yum install -y \ - hiredis-devel \ - leveldb-devel \ - lmdb-devel \ - snappy-devel - # Cleanup yum clean all rm -rf /var/cache/yum diff --git a/.ci/docker/common/install_executorch.sh b/.ci/docker/common/install_executorch.sh index e6588098e8a49..a3296dc0df3ed 100755 --- a/.ci/docker/common/install_executorch.sh +++ b/.ci/docker/common/install_executorch.sh @@ -48,7 +48,6 @@ setup_executorch() { install_flatc_from_source pip_install . - build_executorch_runner "cmake" # Make sure that all the newly generate files are owned by Jenkins chown -R jenkins . 
diff --git a/.ci/docker/common/install_onnx.sh b/.ci/docker/common/install_onnx.sh index de283b18c6fe1..a1a5fde7d2f5b 100755 --- a/.ci/docker/common/install_onnx.sh +++ b/.ci/docker/common/install_onnx.sh @@ -26,18 +26,19 @@ pip_install \ pytest-cov==4.0.0 \ pytest-subtests==0.10.0 \ tabulate==0.9.0 \ - transformers==4.32.1 + transformers==4.36.2 pip_install coloredlogs packaging -retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.17.0.dev20231005006 -pip_install -i https://test.pypi.org/simple/ onnx==1.15.0rc2 -pip_install onnxscript==0.1.0.dev20231128 --no-deps +pip_install onnxruntime==1.17.0 +pip_install onnx==1.15.0 +# pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps +pip_install onnxscript==0.1.0.dev20240315 --no-deps # Cache the transformers model to be used later by ONNX tests. We need to run the transformers # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/ IMPORT_SCRIPT_FILENAME="/tmp/onnx_import_script.py" -as_jenkins echo 'import transformers; transformers.AutoModel.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2");' > "${IMPORT_SCRIPT_FILENAME}" +as_jenkins echo 'import transformers; transformers.AutoModel.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3");' > "${IMPORT_SCRIPT_FILENAME}" # Need a PyTorch version for transformers to work pip_install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu diff --git a/.ci/docker/common/install_openssl.sh b/.ci/docker/common/install_openssl.sh index 2f645f0bcb5e1..c73c9c333c002 100644 --- a/.ci/docker/common/install_openssl.sh +++ b/.ci/docker/common/install_openssl.sh @@ -9,7 +9,8 @@ tar xf "${OPENSSL}.tar.gz" cd "${OPENSSL}" ./config --prefix=/opt/openssl -d '-Wl,--enable-new-dtags,-rpath,$(LIBRPATH)' # NOTE: openssl install errors out when built with the -j option -make -j6; make install_sw +NPROC=$[$(nproc) - 2] +make -j${NPROC}; make install_sw # Link the ssl libraries to the /usr/lib folder. sudo ln -s /opt/openssl/lib/lib* /usr/lib cd .. diff --git a/.ci/docker/common/install_protobuf.sh b/.ci/docker/common/install_protobuf.sh index 4b7a7a6ac23f7..7c966bcae91d3 100755 --- a/.ci/docker/common/install_protobuf.sh +++ b/.ci/docker/common/install_protobuf.sh @@ -2,55 +2,18 @@ set -ex -# This function installs protobuf 3.17 -install_protobuf_317() { - pb_dir="/usr/temp_pb_install_dir" - mkdir -p $pb_dir +pb_dir="/usr/temp_pb_install_dir" +mkdir -p $pb_dir - # On the nvidia/cuda:9-cudnn7-devel-centos7 image we need this symlink or - # else it will fail with - # g++: error: ./../lib64/crti.o: No such file or directory - ln -s /usr/lib64 "$pb_dir/lib64" +# On the nvidia/cuda:9-cudnn7-devel-centos7 image we need this symlink or +# else it will fail with +# g++: error: ./../lib64/crti.o: No such file or directory +ln -s /usr/lib64 "$pb_dir/lib64" - curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3 - tar -xvz -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz - # -j6 to balance memory usage and speed. - # naked `-j` seems to use too much memory. 
- pushd "$pb_dir" && ./configure && make -j6 && make -j6 check && sudo make -j6 install && sudo ldconfig - popd - rm -rf $pb_dir -} +curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3 -install_ubuntu() { - # Ubuntu 14.04 has cmake 2.8.12 as the default option, so we will - # install cmake3 here and use cmake3. - apt-get update - if [[ "$UBUNTU_VERSION" == 14.04 ]]; then - apt-get install -y --no-install-recommends cmake3 - fi - - # Cleanup - apt-get autoclean && apt-get clean - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - - install_protobuf_317 -} - -install_centos() { - install_protobuf_317 -} - -# Install base packages depending on the base OS -ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') -case "$ID" in - ubuntu) - install_ubuntu - ;; - centos) - install_centos - ;; - *) - echo "Unable to determine OS..." - exit 1 - ;; -esac +tar -xvz --no-same-owner -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz +NPROC=$[$(nproc) - 2] +pushd "$pb_dir" && ./configure && make -j${NPROC} && make -j${NPROC} check && sudo make -j${NRPOC} install && sudo ldconfig +popd +rm -rf $pb_dir diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index caae5c112b581..085304ac7c978 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -61,6 +61,10 @@ install_ubuntu() { rocprofiler-dev \ roctracer-dev + if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev + fi + # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5 # search for all unversioned packages # if search fails it will abort this script; use true to avoid case where search fails @@ -80,6 +84,14 @@ install_ubuntu() { fi fi + # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime + if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then + for kdb in /opt/rocm/share/miopen/db/*.kdb + do + sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;" + done + fi + # Cleanup apt-get autoclean && apt-get clean rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* @@ -151,6 +163,14 @@ install_centos() { fi fi + # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime + if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then + for kdb in /opt/rocm/share/miopen/db/*.kdb + do + sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;" + done + fi + # Cleanup yum clean all rm -rf /var/cache/yum diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index 457e0ad77361a..94b94661c4606 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -7,7 +7,7 @@ git clone https://bitbucket.org/icl/magma.git pushd magma # Version 2.7.2 + ROCm related updates -git checkout 823531632140d0edcb7e77c3edc0e837421471c5 +git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6 cp make.inc-examples/make.inc.hip-gcc-mkl make.inc echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index ebde6c3f44761..de009c1a3adbf 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -13,8 +13,11 @@ conda_reinstall() { } if [ -n "${ROCM_VERSION}" ]; then - TRITON_REPO="https://github.com/ROCmSoftwarePlatform/triton" + 
TRITON_REPO="https://github.com/openai/triton" TRITON_TEXT_FILE="triton-rocm" +elif [ -n "${BASEKIT_VERSION}" ]; then + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + TRITON_TEXT_FILE="triton-xpu" else TRITON_REPO="https://github.com/openai/triton" TRITON_TEXT_FILE="triton" @@ -64,5 +67,6 @@ if [ -n "${CONDA_CMAKE}" ]; then # latest numpy version, which fails ASAN tests with the following import error: Numba # needs NumPy 1.20 or less. conda_reinstall cmake="${CMAKE_VERSION}" - conda_reinstall numpy="${NUMPY_VERSION}" + # Note that we install numpy with pip as conda might not have the version we want + pip_install --force-reinstall numpy=="${NUMPY_VERSION}" fi diff --git a/.ci/docker/common/install_ucc.sh b/.ci/docker/common/install_ucc.sh index 333e44e6f779f..2224811bd987b 100755 --- a/.ci/docker/common/install_ucc.sh +++ b/.ci/docker/common/install_ucc.sh @@ -36,7 +36,12 @@ function install_ucc() { git submodule update --init --recursive ./autogen.sh - ./configure --prefix=$UCC_HOME --with-ucx=$UCX_HOME --with-cuda=$with_cuda + # We only run distributed tests on Tesla M60 and A10G + NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" + ./configure --prefix=$UCC_HOME \ + --with-ucx=$UCX_HOME \ + --with-cuda=$with_cuda \ + --with-nvcc-gencode="${NVCC_GENCODE}" time make -j sudo make install diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index 813a7c4e278eb..d98ad2049b47c 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -3,7 +3,7 @@ set -xe # IntelĀ® software for general purpose GPU capabilities. -# Refer to https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html +# Refer to https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html # IntelĀ® oneAPI Base Toolkit (version 2024.0.0) has been updated to include functional and security updates. 
# Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html @@ -21,7 +21,7 @@ function install_ubuntu() { | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null # Add the signed entry to APT sources and configure the APT client to use the Intel repository - echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/production/2328 unified" \ + echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \ | tee /etc/apt/sources.list.d/intel-gpu-jammy.list echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \ | tee /etc/apt/sources.list.d/oneAPI.list diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index b12cc8c236e66..75852c6b81ce2 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -15,7 +15,7 @@ click #Pinned versions: #test that import: -coremltools==5.0b5 +coremltools==5.0b5 ; python_version < "3.12" #Description: Apple framework for ML integration #Pinned versions: 5.0b5 #test that import: @@ -25,6 +25,11 @@ coremltools==5.0b5 #Pinned versions: #test that import: +dill==0.3.7 +#Description: dill extends pickle with serializing and de-serializing for most built-ins +#Pinned versions: 0.3.7 +#test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py + expecttest==0.1.6 #Description: method for writing tests where test framework auto populates # the expected output based on previous runs @@ -47,6 +52,11 @@ junitparser==2.1.1 #Pinned versions: 2.1.1 #test that import: +lark==0.12.0 +#Description: parser +#Pinned versions: 0.12.0 +#test that import: + librosa>=0.6.2 ; python_version < "3.11" #Description: A python package for music and audio analysis #Pinned versions: >=0.6.2 @@ -66,7 +76,7 @@ librosa>=0.6.2 ; python_version < "3.11" #Description: A testing library that allows you to replace parts of your #system under test with mock objects #Pinned versions: -#test that import: test_module_init.py, test_modules.py, test_nn.py, +#test that import: test_modules.py, test_nn.py, #test_testing.py #MonkeyType # breaks pytorch-xla-linux-bionic-py3.7-clang8 @@ -75,10 +85,10 @@ librosa>=0.6.2 ; python_version < "3.11" #Pinned versions: #test that import: -mypy==1.7.0 +mypy==1.9.0 # Pin MyPy version because new errors are likely to appear with each release #Description: linter -#Pinned versions: 1.7.0 +#Pinned versions: 1.9.0 #test that import: test_typing.py, test_type_hints.py networkx==2.8.8 @@ -124,9 +134,9 @@ opt-einsum==3.3 #Pinned versions: 3.3 #test that import: test_linalg.py -optree==0.9.1 +optree==0.11.0 #Description: A library for tree manipulation -#Pinned versions: 0.9.1 +#Pinned versions: 0.11.0 #test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py, #test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py, #common_utils.py, test_eager_transforms.py, test_python_dispatch.py, @@ -137,9 +147,9 @@ optree==0.9.1 #test_pointwise_ops.py, test_dtensor_ops.py, test_torchinductor.py, test_fx.py, #test_fake_tensor.py, test_mps.py -pillow==10.0.1 +pillow==10.3.0 #Description: Python Imaging Library fork -#Pinned versions: 10.0.1 +#Pinned versions: 10.3.0 #test that import: protobuf==3.20.2 @@ -162,11 +172,6 @@ pytest-xdist==3.3.1 #Pinned versions: #test that import: -pytest-shard==0.1.2 
-#Description: plugin spliting up tests in pytest -#Pinned versions: -#test that import: - pytest-flakefinder==1.1.0 #Description: plugin for rerunning tests a fixed number of times in pytest #Pinned versions: 1.1.0 @@ -223,12 +228,11 @@ scikit-image==0.20.0 ; python_version >= "3.10" #Pinned versions: 0.20.3 #test that import: -scipy==1.6.3 ; python_version < "3.10" -scipy==1.8.1 ; python_version == "3.10" -scipy==1.10.1 ; python_version == "3.11" +scipy==1.10.1 ; python_version <= "3.11" +scipy==1.12.0 ; python_version == "3.12" # Pin SciPy because of failing distribution tests (see #60347) #Description: scientific python -#Pinned versions: 1.6.3 +#Pinned versions: 1.10.1 #test that import: test_unary_ufuncs.py, test_torch.py,test_tensor_creation_ops.py #test_spectral_ops.py, test_sparse_csr.py, test_reductions.py,test_nn.py #test_linalg.py, test_binary_ufuncs.py @@ -243,7 +247,8 @@ tb-nightly==2.13.0a20230426 #Pinned versions: #test that import: -#typing-extensions +# needed by torchgen utils +typing-extensions #Description: type hints for python #Pinned versions: #test that import: @@ -258,9 +263,10 @@ unittest-xml-reporting<=3.2.0,>=2.0.0 #Pinned versions: #test that import: -lintrunner==0.10.7 +#lintrunner is supported on aarch64-linux only from 0.12.4 version +lintrunner==0.12.5 #Description: all about linters! -#Pinned versions: 0.10.7 +#Pinned versions: 0.12.5 #test that import: rockset==1.0.3 @@ -268,14 +274,14 @@ rockset==1.0.3 #Pinned versions: 1.0.3 #test that import: -ghstack==0.7.1 +ghstack==0.8.0 #Description: ghstack tool -#Pinned versions: 0.7.1 +#Pinned versions: 0.8.0 #test that import: -jinja2==3.1.2 +jinja2==3.1.4 #Description: jinja2 template engine -#Pinned versions: 3.1.2 +#Pinned versions: 3.1.4 #test that import: pytest-cpp==2.3.0 @@ -293,7 +299,8 @@ tensorboard==2.13.0 #Pinned versions: #test that import: test_tensorboard -pywavelets==1.4.1 +pywavelets==1.4.1 ; python_version < "3.12" +pywavelets==1.5.0 ; python_version >= "3.12" #Description: This is a requirement of scikit-image, we need to pin # it here because 1.5.0 conflicts with numpy 1.21.2 used in CI #Pinned versions: 1.4.1 diff --git a/.ci/docker/triton_version.txt b/.ci/docker/triton_version.txt index ccbccc3dc6263..4a36342fcab70 100644 --- a/.ci/docker/triton_version.txt +++ b/.ci/docker/triton_version.txt @@ -1 +1 @@ -2.2.0 +3.0.0 diff --git a/.ci/docker/ubuntu-xpu/Dockerfile b/.ci/docker/ubuntu-xpu/Dockerfile index a34cb3b20887f..9a3ff68d159b9 100644 --- a/.ci/docker/ubuntu-xpu/Dockerfile +++ b/.ci/docker/ubuntu-xpu/Dockerfile @@ -61,15 +61,20 @@ COPY ci_commit_pins/timm.txt timm.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +# Install XPU Dependencies +ARG BASEKIT_VERSION +COPY ./common/install_xpu.sh install_xpu.sh +RUN bash ./install_xpu.sh && rm install_xpu.sh + ARG TRITON # Install triton, this needs to be done before sccache because the latter will # try to reach out to S3, which docker build runners don't have access COPY ./common/install_triton.sh install_triton.sh COPY ./common/common_utils.sh common_utils.sh -# TODO: will add triton xpu commit -COPY ci_commit_pins/triton.txt triton.txt +COPY ci_commit_pins/triton-xpu.txt triton-xpu.txt +COPY triton_version.txt triton_version.txt RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi -RUN rm install_triton.sh common_utils.sh triton.txt +RUN rm install_triton.sh common_utils.sh triton-xpu.txt 
triton_version.txt # (optional) Install database packages like LMDB and LevelDB ARG DB @@ -85,11 +90,6 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi RUN rm install_vision.sh cache_vision_models.sh common_utils.sh ENV INSTALLED_VISION ${VISION} -# Install XPU Dependencies -ARG BASEKIT_VERSION -COPY ./common/install_xpu.sh install_xpu.sh -RUN bash ./install_xpu.sh && rm install_xpu.sh - # (optional) Install non-default CMake version ARG CMAKE_VERSION COPY ./common/install_cmake.sh install_cmake.sh diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 0f269e84b09d6..b471ce3b8963c 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -37,6 +37,7 @@ COPY requirements-ci.txt requirements-docs.txt /opt/conda/ COPY ./common/install_conda.sh install_conda.sh COPY ./common/common_utils.sh common_utils.sh RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt +RUN if [ -n "${UNINSTALL_DILL}" ]; then pip uninstall -y dill; fi # Install gcc ARG GCC_VERSION @@ -160,10 +161,19 @@ COPY ./common/install_onnx.sh ./common/common_utils.sh ./ RUN if [ -n "${ONNX}" ]; then bash ./install_onnx.sh; fi RUN rm install_onnx.sh common_utils.sh +# (optional) Build ACL +ARG ACL +COPY ./common/install_acl.sh install_acl.sh +RUN if [ -n "${ACL}" ]; then bash ./install_acl.sh; fi +RUN rm install_acl.sh +ENV INSTALLED_ACL ${ACL} + # Install ccache/sccache (do this last, so we get priority in PATH) +ARG SKIP_SCCACHE_INSTALL COPY ./common/install_cache.sh install_cache.sh ENV PATH /opt/cache/bin:$PATH -RUN bash ./install_cache.sh && rm install_cache.sh +RUN if [ -z "${SKIP_SCCACHE_INSTALL}" ]; then bash ./install_cache.sh; fi +RUN rm install_cache.sh # Add jni.h for java host build COPY ./common/install_jni.sh install_jni.sh @@ -180,7 +190,9 @@ ARG BUILD_ENVIRONMENT ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT} # Install LLVM dev version (Defined in the pytorch/builder github repository) +ARG SKIP_LLVM_SRC_BUILD_INSTALL COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm +RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi # AWS specific CUDA build guidance ENV TORCH_CUDA_ARCH_LIST Maxwell diff --git a/.ci/onnx/common.sh b/.ci/onnx/common.sh index 2c49e3ed3a2e5..3de5836a02858 100644 --- a/.ci/onnx/common.sh +++ b/.ci/onnx/common.sh @@ -1,5 +1,9 @@ +#!/bin/bash + set -ex +source "$(dirname "${BASH_SOURCE[0]}")/../pytorch/common_utils.sh" + LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd) TEST_DIR="$ROOT_DIR/test" diff --git a/.ci/onnx/test.sh b/.ci/onnx/test.sh index 88fad177b7e21..a7d3b72c62a7e 100755 --- a/.ci/onnx/test.sh +++ b/.ci/onnx/test.sh @@ -3,6 +3,20 @@ # shellcheck source=./common.sh source "$(dirname "${BASH_SOURCE[0]}")/common.sh" +# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96) +WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace") +cleanup_workspace() { + echo "sudo may print the following warning message that can be ignored. The chown command will still run." + echo " sudo: setrlimit(RLIMIT_STACK): Operation not permitted" + echo "For more details refer to https://github.com/sudo-project/sudo/issues/42" + sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace +} +# Disable shellcheck SC2064 as we want to parse the original owner immediately. 
+# shellcheck disable=SC2064 +trap_add cleanup_workspace EXIT +sudo chown -R jenkins /var/lib/jenkins/workspace +git config --global --add safe.directory /var/lib/jenkins/workspace + if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # TODO: This can be removed later once vision is also part of the Docker image pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)" diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index b72461b5a68cb..b81caa0513691 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -81,7 +81,35 @@ if ! which conda; then export USE_MKLDNN=0 fi else - export CMAKE_PREFIX_PATH=/opt/conda + # CMAKE_PREFIX_PATH precedences + # 1. $CONDA_PREFIX, if defined. This follows the pytorch official build instructions. + # 2. /opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}, if ANACONDA_PYTHON_VERSION defined. + # This is for CI, which defines ANACONDA_PYTHON_VERSION but not CONDA_PREFIX. + # 3. $(conda info --base). The fallback value of pytorch official build + # instructions actually refers to this. + # Commonly this is /opt/conda/ + if [[ -v CONDA_PREFIX ]]; then + export CMAKE_PREFIX_PATH=${CONDA_PREFIX} + elif [[ -v ANACONDA_PYTHON_VERSION ]]; then + export CMAKE_PREFIX_PATH="/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}" + else + # already checked by `! which conda` + CMAKE_PREFIX_PATH="$(conda info --base)" + export CMAKE_PREFIX_PATH + fi + + # Workaround required for MKL library linkage + # https://github.com/pytorch/pytorch/issues/119557 + if [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then + export CMAKE_LIBRARY_PATH="/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/lib/" + export CMAKE_INCLUDE_PATH="/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/include/" + fi +fi + +if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then + export USE_MKLDNN=1 + export USE_MKLDNN_ACL=1 + export ACL_ROOT_DIR=/ComputeLibrary fi if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then @@ -210,6 +238,24 @@ if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]] export BUILD_STATIC_RUNTIME_BENCHMARK=ON fi +# Do not change workspace permissions for ROCm CI jobs +# as it can leave workspace with bad permissions for cancelled jobs +if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then + # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96) + WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace") + cleanup_workspace() { + echo "sudo may print the following warning message that can be ignored. The chown command will still run." + echo " sudo: setrlimit(RLIMIT_STACK): Operation not permitted" + echo "For more details refer to https://github.com/sudo-project/sudo/issues/42" + sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace + } + # Disable shellcheck SC2064 as we want to parse the original owner immediately. + # shellcheck disable=SC2064 + trap_add cleanup_workspace EXIT + sudo chown -R jenkins /var/lib/jenkins/workspace + git config --global --add safe.directory /var/lib/jenkins/workspace +fi + if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then set -e @@ -235,13 +281,17 @@ else ( ! get_exit_code python setup.py clean bad_argument ) if [[ "$BUILD_ENVIRONMENT" != *libtorch* ]]; then - # rocm builds fail when WERROR=1 # XLA test build fails when WERROR=1 # set only when building other architectures # or building non-XLA tests. 
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *xla* ]]; then + if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then + # Install numpy-2.0 release candidate for builds + # Which should be backward compatible with Numpy-1.X + python -mpip install --pre numpy==2.0.0rc1 + fi WERROR=1 python setup.py bdist_wheel else python setup.py bdist_wheel @@ -341,4 +391,8 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; python tools/stats/export_test_times.py fi -print_sccache_stats +# snadampal: skipping it till sccache support added for aarch64 +# https://github.com/pytorch/pytorch/issues/121559 +if [[ "$BUILD_ENVIRONMENT" != *aarch64* ]]; then + print_sccache_stats +fi diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index fa46391552217..51297f7bfff88 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -158,6 +158,11 @@ function install_torchvision() { fi } +function install_tlparse() { + pip_install --user "tlparse==0.3.7" + PATH="$(python -m site --user-base)/bin:$PATH" +} + function install_torchrec_and_fbgemm() { local torchrec_commit torchrec_commit=$(get_pinned_commit torchrec) diff --git a/.ci/pytorch/macos-common.sh b/.ci/pytorch/macos-common.sh index eef066b4dc9b5..1c7bc103673de 100755 --- a/.ci/pytorch/macos-common.sh +++ b/.ci/pytorch/macos-common.sh @@ -9,7 +9,7 @@ sysctl -a | grep machdep.cpu # These are required for both the build job and the test job. # In the latter to test cpp extensions. -export MACOSX_DEPLOYMENT_TARGET=11.0 +export MACOSX_DEPLOYMENT_TARGET=11.1 export CXX=clang++ export CC=clang diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh index 739d0ba3357fe..a54b8c360eba5 100755 --- a/.ci/pytorch/macos-test.sh +++ b/.ci/pytorch/macos-test.sh @@ -149,6 +149,8 @@ test_jit_hooks() { assert_git_not_dirty } +install_tlparse + if [[ $NUM_TEST_SHARDS -gt 1 ]]; then test_python_shard "${SHARD_NUMBER}" if [[ "${SHARD_NUMBER}" == 1 ]]; then diff --git a/.ci/pytorch/multigpu-test.sh b/.ci/pytorch/multigpu-test.sh index 70ae4d2974e8c..7e04e92919cb7 100755 --- a/.ci/pytorch/multigpu-test.sh +++ b/.ci/pytorch/multigpu-test.sh @@ -34,7 +34,6 @@ time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/test # functional collective tests time python test/run_test.py --verbose -i distributed/test_functional_api - # DTensor tests time python test/run_test.py --verbose -i distributed/_tensor/test_random_ops time python test/run_test.py --verbose -i distributed/_tensor/test_dtensor_compile @@ -46,9 +45,14 @@ time python test/run_test.py --verbose -i distributed/test_device_mesh time python test/run_test.py --verbose -i distributed/tensor/parallel/test_ddp_2d_parallel time python test/run_test.py --verbose -i distributed/tensor/parallel/test_fsdp_2d_parallel time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples +time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state + +# FSDP2 tests +time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh # Other tests time python test/run_test.py --verbose -i test_cuda_primary_ctx -time python test/run_test.py --verbose -i test_optim -- -k optimizers_with_varying_tensors +time python test/run_test.py --verbose -i test_optim -- -k test_forloop_goes_right_direction_multigpu +time python test/run_test.py --verbose -i test_optim -- -k test_mixed_device_dtype time python test/run_test.py --verbose -i 
test_foreach -- -k test_tensors_grouping assert_git_not_dirty diff --git a/.ci/pytorch/perf_test/compare_with_baseline.py b/.ci/pytorch/perf_test/compare_with_baseline.py index 49b77cbba2a5d..caf9e993bd29c 100644 --- a/.ci/pytorch/perf_test/compare_with_baseline.py +++ b/.ci/pytorch/perf_test/compare_with_baseline.py @@ -59,16 +59,16 @@ print("sample sigma: ", sample_sigma) if math.isnan(sample_mean): - raise Exception("""Error: sample mean is NaN""") + raise Exception("""Error: sample mean is NaN""") # noqa: TRY002 elif math.isnan(sample_sigma): - raise Exception("""Error: sample sigma is NaN""") + raise Exception("""Error: sample sigma is NaN""") # noqa: TRY002 z_value = (sample_mean - mean) / sigma print("z-value: ", z_value) if z_value >= 3: - raise Exception( + raise Exception( # noqa: TRY002 f"""\n z-value >= 3, there is high chance of perf regression.\n To reproduce this regression, run diff --git a/.ci/pytorch/python_doc_push_script.sh b/.ci/pytorch/python_doc_push_script.sh index 86c2037b12868..d4076d3469e9f 100755 --- a/.ci/pytorch/python_doc_push_script.sh +++ b/.ci/pytorch/python_doc_push_script.sh @@ -26,8 +26,8 @@ echo "error: python_doc_push_script.sh: version (arg2) not specified" fi # Argument 1: Where to copy the built documentation to -# (pytorch.github.io/$install_path) -install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}" +# (pytorch_docs/$install_path) +install_path="${1:-${DOCS_INSTALL_PATH:-${DOCS_VERSION}}}" if [ -z "$install_path" ]; then echo "error: python_doc_push_script.sh: install_path (arg1) not specified" exit 1 @@ -68,8 +68,8 @@ build_docs () { } -git clone https://github.com/pytorch/pytorch.github.io -b "$branch" --depth 1 -pushd pytorch.github.io +git clone https://github.com/pytorch/docs pytorch_docs -b "$branch" --depth 1 +pushd pytorch_docs export LC_ALL=C export PATH=/opt/conda/bin:$PATH @@ -105,6 +105,7 @@ if [ "$is_main_doc" = true ]; then echo undocumented objects found: cat build/coverage/python.txt echo "Make sure you've updated relevant .rsts in docs/source!" + echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'" exit 1 fi else diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index e15fc73cf7320..a22bebc166792 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -6,6 +6,27 @@ set -ex +# shellcheck source=./common.sh +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +# Do not change workspace permissions for ROCm CI jobs +# as it can leave workspace with bad permissions for cancelled jobs +if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then + # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96) + WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace") + cleanup_workspace() { + echo "sudo may print the following warning message that can be ignored. The chown command will still run." + echo " sudo: setrlimit(RLIMIT_STACK): Operation not permitted" + echo "For more details refer to https://github.com/sudo-project/sudo/issues/42" + sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace + } + # Disable shellcheck SC2064 as we want to parse the original owner immediately. 
+ # shellcheck disable=SC2064 + trap_add cleanup_workspace EXIT + sudo chown -R jenkins /var/lib/jenkins/workspace + git config --global --add safe.directory /var/lib/jenkins/workspace +fi + echo "Environment variables:" env @@ -90,9 +111,6 @@ if [[ -n $TESTS_TO_INCLUDE ]]; then INCLUDE_CLAUSE="--include $TESTS_TO_INCLUDE" fi -# shellcheck source=./common.sh -source "$(dirname "${BASH_SOURCE[0]}")/common.sh" - echo "Environment variables" env @@ -130,6 +148,8 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" + # setting PYTHON_TEST_EXTRA_OPTION + export PYTHON_TEST_EXTRA_OPTION="--xpu" fi if [[ "$TEST_CONFIG" == *crossref* ]]; then @@ -137,6 +157,8 @@ if [[ "$TEST_CONFIG" == *crossref* ]]; then fi if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + # regression in ROCm 6.0 on MI50 CI runners due to hipblaslt; remove in 6.1 + export VALGRIND=OFF # Print GPU info rocminfo rocminfo | grep -E 'Name:.*\sgfx|Marketing' @@ -159,6 +181,13 @@ if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then export PATH="$HOME/.local/bin:$PATH" fi +if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then + # TODO: revisit this once the CI is stabilized on aarch64 linux + export VALGRIND=OFF +fi + +install_tlparse + # DANGER WILL ROBINSON. The LD_PRELOAD here could cause you problems # if you're not careful. Check this if you made some changes and the # ASAN test is not working @@ -205,8 +234,6 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then export LD_PRELOAD=/usr/lib/llvm-15/lib/clang/15.0.7/lib/linux/libclang_rt.asan-x86_64.so # Disable valgrind for asan export VALGRIND=OFF - # Increase stack size, because ASAN red zones use more stack - ulimit -s 81920 (cd test && python -c "import torch; print(torch.__version__, torch.version.git_version)") echo "The next four invocations are expected to crash; if they don't that means ASAN/UBSAN is misconfigured" @@ -250,14 +277,14 @@ test_python_shard() { # Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly # shellcheck disable=SC2086 - time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION assert_git_not_dirty } test_python() { # shellcheck disable=SC2086 - time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION assert_git_not_dirty } @@ -268,34 +295,13 @@ test_dynamo_shard() { exit 1 fi python tools/dynamo/verify_dynamo.py - # Temporarily disable test_fx for dynamo pending the investigation on TTS - # regression in https://github.com/pytorch/torchdynamo/issues/784 + # PLEASE DO NOT ADD ADDITIONAL EXCLUDES HERE. + # Instead, use @skipIfTorchDynamo on your tests. 
time python test/run_test.py --dynamo \ + --exclude-inductor-tests \ --exclude-jit-executor \ --exclude-distributed-tests \ - --exclude \ - test_ao_sparsity \ - test_autograd \ - test_jit \ - test_proxy_tensor \ - test_quantization \ - test_public_bindings \ - test_dataloader \ - test_reductions \ - test_namedtensor \ - test_namedtuple_return_api \ - profiler/test_profiler \ - profiler/test_profiler_tree \ - test_overrides \ - test_python_dispatch \ - test_fx \ - test_package \ - test_legacy_vmap \ - test_custom_ops \ - test_content_store \ - export/test_db \ - functorch/test_dims \ - functorch/test_aotdispatch \ + --exclude-torch-export-tests \ --shard "$1" "$NUM_TEST_SHARDS" \ --verbose assert_git_not_dirty @@ -304,11 +310,23 @@ test_dynamo_shard() { test_inductor_distributed() { # Smuggle a few multi-gpu tests here so that we don't have to request another large node echo "Testing multi_gpu tests in test_torchinductor" - pytest test/inductor/test_torchinductor.py -k test_multi_gpu - pytest test/inductor/test_aot_inductor.py -k test_non_default_cuda_device - pytest test/inductor/test_aot_inductor.py -k test_replicate_on_devices - pytest test/distributed/_tensor/test_dtensor_compile.py - pytest test/distributed/tensor/parallel/test_fsdp_2d_parallel.py + python test/run_test.py -i inductor/test_torchinductor.py -k test_multi_gpu --verbose + python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_cuda_device --verbose + python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose + python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose + python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose + python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose + python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose # this runs on both single-gpu and multi-gpu instance. 
It should be smart about skipping tests that aren't supported # with if required # gpus aren't available @@ -320,16 +338,24 @@ test_inductor() { python tools/dynamo/verify_dynamo.py python test/run_test.py --inductor --include test_modules test_ops test_ops_gradients test_torch --verbose # Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state - python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo --verbose + python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor --verbose # docker build uses bdist_wheel which does not work with test_aot_inductor # TODO: need a faster way to build if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop - CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aot_inductor + CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference fi } +test_inductor_cpp_wrapper_abi_compatible() { + export TORCHINDUCTOR_ABI_COMPATIBLE=1 + echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1" + # cpu stack allocation causes segfault and needs more investigation + TORCHINDUCTOR_STACK_ALLOCATION=0 python test/run_test.py --include inductor/test_cpu_cpp_wrapper + python test/run_test.py --include inductor/test_cuda_cpp_wrapper +} + # "Global" flags for inductor benchmarking controlled by TEST_CONFIG # For example 'dynamic_aot_eager_torchbench' TEST_CONFIG means we run # the benchmark script with '--dynamic-shapes --backend aot_eager --device cuda' @@ -422,7 +448,7 @@ test_perf_for_dashboard() { --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv" fi if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then - python "benchmarks/dynamo/$suite.py" \ + TORCHINDUCTOR_ABI_COMPATIBLE=1 python "benchmarks/dynamo/$suite.py" \ "${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \ --output "$TEST_REPORTS_DIR/${backend}_aot_inductor_${suite}_${dtype}_${mode}_cuda_${target}.csv" fi @@ -431,6 +457,17 @@ test_perf_for_dashboard() { "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \ --output "$TEST_REPORTS_DIR/${backend}_max_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv" fi + if [[ "$DASHBOARD_TAG" == *cudagraphs_low_precision-true* ]] && [[ "$mode" == "inference" ]]; then + # TODO: This has a new dtype called quant and the benchmarks script needs to be updated to support this. + # The tentative command is as follows. It doesn't work now, but it's ok because we only need mock data + # to fill the dashboard. + python "benchmarks/dynamo/$suite.py" \ + "${target_flag[@]}" --"$mode" --quant --backend "$backend" "$@" \ + --output "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_cuda_${target}.csv" || true + # Copy cudagraph results as mock data, easiest choice? 
+ cp "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_cuda_${target}.csv" \ + "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_cuda_${target}.csv" + fi done done } @@ -466,6 +503,11 @@ test_single_dynamo_benchmark() { test_perf_for_dashboard "$suite" \ "${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}" else + if [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then + # Test AOTInductor with the ABI-compatible mode on CI + # This can be removed once the ABI-compatible mode becomes default. + export TORCHINDUCTOR_ABI_COMPATIBLE=1 + fi python "benchmarks/dynamo/$suite.py" \ --ci --accuracy --timing --explain \ "${DYNAMO_BENCHMARK_FLAGS[@]}" \ @@ -480,6 +522,11 @@ test_single_dynamo_benchmark() { fi } +test_inductor_micro_benchmark() { + TEST_REPORTS_DIR=$(pwd)/test/test-micro-reports + python benchmarks/gpt_fast/benchmark.py +} + test_dynamo_benchmark() { # Usage: test_dynamo_benchmark huggingface 0 TEST_REPORTS_DIR=$(pwd)/test/test-reports @@ -522,7 +569,7 @@ test_inductor_torchbench_smoketest_perf() { # The threshold value needs to be actively maintained to make this check useful python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4 - python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \ + TORCHINDUCTOR_ABI_COMPATIBLE=1 python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \ --export-aot-inductor --only nanogpt --output "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" # The threshold value needs to be actively maintained to make this check useful # The perf number of nanogpt seems not very stable, e.g. @@ -543,6 +590,56 @@ test_inductor_torchbench_smoketest_perf() { done } +test_inductor_torchbench_cpu_smoketest_perf(){ + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + + #set jemalloc + JEMALLOC_LIB="/usr/lib/x86_64-linux-gnu/libjemalloc.so.2" + IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so" + export LD_PRELOAD="$JEMALLOC_LIB":"$IOMP_LIB":"$LD_PRELOAD" + export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" + export KMP_AFFINITY=granularity=fine,compact,1,0 + export KMP_BLOCKTIME=1 + CORES=$(lscpu | grep Core | awk '{print $4}') + export OMP_NUM_THREADS=$CORES + end_core=$(( CORES-1 )) + + MODELS_SPEEDUP_TARGET=benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv + + grep -v '^ *#' < "$MODELS_SPEEDUP_TARGET" | while IFS=',' read -r -a model_cfg + do + local model_name=${model_cfg[0]} + local data_type=${model_cfg[1]} + local speedup_target=${model_cfg[4]} + if [[ ${model_cfg[3]} == "cpp" ]]; then + export TORCHINDUCTOR_CPP_WRAPPER=1 + else + unset TORCHINDUCTOR_CPP_WRAPPER + fi + local output_name="$TEST_REPORTS_DIR/inductor_inference_${model_cfg[0]}_${model_cfg[1]}_${model_cfg[2]}_${model_cfg[3]}_cpu_smoketest.csv" + + if [[ ${model_cfg[2]} == "dynamic" ]]; then + taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \ + --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" --dynamic-shapes \ + --dynamic-batch-only --freezing --timeout 9000 --backend=inductor --output "$output_name" + else + taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \ + --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" \ + --freezing --timeout 9000 --backend=inductor --output "$output_name" + fi + cat "$output_name" + # The threshold value needs to be 
actively maintained to make this check useful. + python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target" + done +} + +test_torchbench_gcp_smoketest(){ + pushd "${TORCHBENCHPATH}" + python test.py -v + popd +} + test_python_gloo_with_tls() { source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh" assert_git_not_dirty @@ -693,9 +790,8 @@ test_xpu_bin(){ TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" - for xpu_case in "${BUILD_BIN_DIR}"/*{xpu,sycl}* - do - if [[ "$xpu_case" != *"*"* ]]; then + for xpu_case in "${BUILD_BIN_DIR}"/*{xpu,sycl}*; do + if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then case_name=$(basename "$xpu_case") echo "Testing ${case_name} ..." "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml @@ -943,7 +1039,8 @@ test_bazel() { tools/bazel test --config=cpu-only --test_timeout=480 --test_output=all --test_tag_filters=-gpu-required --test_filter=-*CUDA :all_tests else - tools/bazel test --test_output=errors \ + # Increase the test timeout to 480 like CPU tests because modules_test frequently timeout + tools/bazel test --test_timeout=480 --test_output=errors \ //:any_test \ //:autograd_test \ //:dataloader_test \ @@ -1038,14 +1135,17 @@ test_docs_test() { } test_executorch() { + echo "Install torchvision and torchaudio" + install_torchvision + install_torchaudio + pushd /executorch - echo "Install torchvision and torchaudio" - # TODO(huydhn): Switch this to the pinned commits on ExecuTorch once they are - # there. These libraries need to be built here, and not part of the Docker - # image because they require the target version of torch to be installed first - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git" - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git" + # NB: We need to build ExecuTorch runner here and not inside the Docker image + # because it depends on PyTorch + # shellcheck disable=SC1091 + source .ci/scripts/utils.sh + build_executorch_runner "cmake" echo "Run ExecuTorch regression tests for some models" # NB: This is a sample model, more can be added here @@ -1063,11 +1163,33 @@ test_executorch() { assert_git_not_dirty } +test_linux_aarch64(){ + python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ + test_transformers test_multiprocessing test_numpy_interop --verbose + + # Dynamo tests + python test/run_test.py --include dynamo/test_compile dynamo/test_backends dynamo/test_comptime dynamo/test_config \ + dynamo/test_functions dynamo/test_fx_passes_pre_grad dynamo/test_interop dynamo/test_model_output dynamo/test_modules \ + dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles --verbose + + # Inductor tests + python test/run_test.py --include inductor/test_torchinductor inductor/test_benchmark_fusion inductor/test_codecache \ + inductor/test_config inductor/test_control_flow inductor/test_coordinate_descent_tuner inductor/test_fx_fusion \ + inductor/test_group_batch_fusion inductor/test_inductor_freezing inductor/test_inductor_utils \ + inductor/test_inplacing_pass inductor/test_kernel_benchmark inductor/test_layout_optim \ + inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \ + inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \ + inductor/test_split_cat_fx_passes 
inductor/test_standalone_compile inductor/test_torchinductor \ + inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes --verbose +} + if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") fi -if [[ "${TEST_CONFIG}" == *backward* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then + test_linux_aarch64 +elif [[ "${TEST_CONFIG}" == *backward* ]]; then test_forward_backward_compatibility # Do NOT add tests after bc check tests, see its comment. elif [[ "${TEST_CONFIG}" == *xla* ]]; then @@ -1092,6 +1214,8 @@ elif [[ "$TEST_CONFIG" == deploy ]]; then test_torch_deploy elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then test_inductor_distributed +elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then + test_inductor_micro_benchmark elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then install_torchvision id=$((SHARD_NUMBER-1)) @@ -1114,6 +1238,14 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then checkout_install_torchbench hf_Bert hf_Albert nanogpt timm_vision_transformer PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf + elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then + checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_gcn \ + llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \ + shufflenet_v2_x1_0 hf_GPT2 + PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf + elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then + checkout_install_torchbench + TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest else checkout_install_torchbench # Do this after checkout_install_torchbench to ensure we clobber any @@ -1123,6 +1255,9 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then fi PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id" fi +elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then + install_torchvision + test_inductor_cpp_wrapper_abi_compatible elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then install_torchvision test_inductor diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 070e7a14687ee..28bd083f984ab 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -16,24 +16,23 @@ set PATH=C:\Program Files\CMake\bin;C:\Program Files\7-Zip;C:\ProgramData\chocol set INSTALLER_DIR=%SCRIPT_HELPERS_DIR%\installation-helpers - -call %INSTALLER_DIR%\install_mkl.bat -if errorlevel 1 exit /b -if not errorlevel 0 exit /b - call %INSTALLER_DIR%\install_magma.bat -if errorlevel 1 exit /b -if not errorlevel 0 exit /b +if errorlevel 1 goto fail +if not errorlevel 0 goto fail call %INSTALLER_DIR%\install_sccache.bat -if errorlevel 1 exit /b -if not errorlevel 0 exit /b +if errorlevel 1 goto fail +if not errorlevel 0 goto fail :: Miniconda has been installed as part of the Windows AMI with all the dependencies. 
:: We just need to activate it here call %INSTALLER_DIR%\activate_miniconda3.bat -if errorlevel 1 exit /b -if not errorlevel 0 exit /b +if errorlevel 1 goto fail +if not errorlevel 0 goto fail + +call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0 +if errorlevel 1 goto fail +if not errorlevel 0 goto fail :: Override VS env here pushd . @@ -42,8 +41,8 @@ if "%VC_VERSION%" == "" ( ) else ( call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64 -vcvars_ver=%VC_VERSION% ) -if errorlevel 1 exit /b -if not errorlevel 0 exit /b +if errorlevel 1 goto fail +if not errorlevel 0 goto fail @echo on popd @@ -53,12 +52,12 @@ set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION% if x%CUDA_VERSION:.=%==x%CUDA_VERSION% ( echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.' - exit /b 1 + goto fail ) rem version transformer, for example 10.1 to 10_1. if x%CUDA_VERSION:.=%==x%CUDA_VERSION% ( echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.' - exit /b 1 + goto fail ) set VERSION_SUFFIX=%CUDA_VERSION:.=_% set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH% @@ -89,8 +88,8 @@ set SCCACHE_IGNORE_SERVER_IO_ERROR=1 sccache --stop-server sccache --start-server sccache --zero-stats -set CC=sccache-cl -set CXX=sccache-cl +set CMAKE_C_COMPILER_LAUNCHER=sccache +set CMAKE_CXX_COMPILER_LAUNCHER=sccache set CMAKE_GENERATOR=Ninja @@ -102,8 +101,8 @@ if "%USE_CUDA%"=="1" ( :: CMake requires a single command as CUDA_NVCC_EXECUTABLE, so we push the wrappers :: randomtemp.exe and sccache.exe into a batch file which CMake invokes. curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.4/randomtemp.exe --output %TMP_DIR_WIN%\bin\randomtemp.exe - if errorlevel 1 exit /b - if not errorlevel 0 exit /b + if errorlevel 1 goto fail + if not errorlevel 0 goto fail echo @"%TMP_DIR_WIN%\bin\randomtemp.exe" "%TMP_DIR_WIN%\bin\sccache.exe" "%CUDA_PATH%\bin\nvcc.exe" %%* > "%TMP_DIR%/bin/nvcc.bat" cat %TMP_DIR%/bin/nvcc.bat set CUDA_NVCC_EXECUTABLE=%TMP_DIR%/bin/nvcc.bat @@ -115,8 +114,8 @@ if "%USE_CUDA%"=="1" ( set python setup.py bdist_wheel -if errorlevel 1 exit /b -if not errorlevel 0 exit /b +if errorlevel 1 goto fail +if not errorlevel 0 goto fail sccache --show-stats python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])" ( @@ -136,3 +135,8 @@ python -c "import os, glob; os.system('python -mpip install --no-index --no-deps sccache --show-stats --stats-format json | jq .stats > sccache-stats-%BUILD_ENVIRONMENT%-%OUR_GITHUB_JOB_ID%.json sccache --stop-server + +exit /b 0 + +:fail +exit /b 1 diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/install_mkl.bat b/.ci/pytorch/win-test-helpers/installation-helpers/install_mkl.bat deleted file mode 100644 index 6c676d1baeded..0000000000000 --- a/.ci/pytorch/win-test-helpers/installation-helpers/install_mkl.bat +++ /dev/null @@ -1,14 +0,0 @@ -if "%REBUILD%"=="" ( - if "%BUILD_ENVIRONMENT%"=="" ( - curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z --output %TMP_DIR_WIN%\mkl.7z - ) else ( - aws s3 cp s3://ossci-windows/mkl_2020.2.254.7z %TMP_DIR_WIN%\mkl.7z --quiet - ) - if errorlevel 1 exit /b - if not errorlevel 0 exit /b - 7z x -aoa %TMP_DIR_WIN%\mkl.7z -o%TMP_DIR_WIN%\mkl - if errorlevel 1 exit /b - if not errorlevel 0 exit /b -) -set CMAKE_INCLUDE_PATH=%TMP_DIR_WIN%\mkl\include -set LIB=%TMP_DIR_WIN%\mkl\lib;%LIB% 
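The Windows build script above no longer masquerades sccache as the compiler (`sccache-cl`); it now points CMake at sccache through the compiler-launcher variables. A minimal sketch of the same mechanism invoked directly, where the generator and the source/build paths are illustrative assumptions rather than what CI uses:

```sh
# CMake prepends the launcher to every compile command, so sccache wraps
# cl/gcc/clang without having to pretend to be the compiler itself.
cmake -GNinja \
  -DCMAKE_C_COMPILER_LAUNCHER=sccache \
  -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
  -S . -B build
cmake --build build
sccache --show-stats   # a rebuild should show cache hits
```

Exporting the variables in the environment, as the batch script does, has the same effect: CMake 3.17+ initializes `CMAKE_<LANG>_COMPILER_LAUNCHER` from the environment on the first configure.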
diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat b/.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat index 6f8cc15ba8684..7989f7c6ece3f 100644 --- a/.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat +++ b/.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat @@ -1,18 +1,13 @@ mkdir %TMP_DIR_WIN%\bin if "%REBUILD%"=="" ( - :check_sccache - %TMP_DIR_WIN%\bin\sccache.exe --show-stats || ( + IF EXIST %TMP_DIR_WIN%\bin\sccache.exe ( taskkill /im sccache.exe /f /t || ver > nul del %TMP_DIR_WIN%\bin\sccache.exe || ver > nul - del %TMP_DIR_WIN%\bin\sccache-cl.exe || ver > nul - if "%BUILD_ENVIRONMENT%"=="" ( - curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output %TMP_DIR_WIN%\bin\sccache.exe - curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache-cl.exe --output %TMP_DIR_WIN%\bin\sccache-cl.exe - ) else ( - aws s3 cp s3://ossci-windows/sccache.exe %TMP_DIR_WIN%\bin\sccache.exe - aws s3 cp s3://ossci-windows/sccache-cl.exe %TMP_DIR_WIN%\bin\sccache-cl.exe - ) - goto :check_sccache ) -) + if "%BUILD_ENVIRONMENT%"=="" ( + curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache-v0.7.4.exe --output %TMP_DIR_WIN%\bin\sccache.exe + ) else ( + aws s3 cp s3://ossci-windows/sccache-v0.7.4.exe %TMP_DIR_WIN%\bin\sccache.exe + ) +) \ No newline at end of file diff --git a/.circleci/README.md b/.circleci/README.md index 569f58a1242e6..24dde8b47666f 100644 --- a/.circleci/README.md +++ b/.circleci/README.md @@ -1,468 +1,4 @@ Warning ======= -Contents may be out of date. Our CircleCI workflows are gradually being migrated to Github actions. - -Structure of CI -=============== - -setup job: -1. Does a git checkout -2. Persists CircleCI scripts (everything in `.circleci`) into a workspace. Why? - We don't always do a Git checkout on all subjobs, but we usually - still want to be able to call scripts one way or another in a subjob. - Persisting files this way lets us have access to them without doing a - checkout. This workspace is conventionally mounted on `~/workspace` - (this is distinguished from `~/project`, which is the conventional - working directory that CircleCI will default to starting your jobs - in.) -3. Write out the commit message to `.circleci/COMMIT_MSG`. This is so - we can determine in subjobs if we should actually run the jobs or - not, even if there isn't a Git checkout. - - -CircleCI configuration generator -================================ - -One may no longer make changes to the `.circleci/config.yml` file directly. -Instead, one must edit these Python scripts or files in the `verbatim-sources/` directory. - - -Usage ----------- - -1. Make changes to these scripts. -2. Run the `regenerate.sh` script in this directory and commit the script changes and the resulting change to `config.yml`. - -You'll see a build failure on GitHub if the scripts don't agree with the checked-in version. - - -Motivation ----------- - -These scripts establish a single, authoritative source of documentation for the CircleCI configuration matrix. -The documentation, in the form of diagrams, is automatically generated and cannot drift out of sync with the YAML content. - -Furthermore, consistency is enforced within the YAML config itself, by using a single source of data to generate -multiple parts of the file. 
- -* Facilitates one-off culling/enabling of CI configs for testing PRs on special targets - -Also see https://github.com/pytorch/pytorch/issues/17038 - - -Future direction ----------------- - -### Declaring sparse config subsets -See comment [here](https://github.com/pytorch/pytorch/pull/17323#pullrequestreview-206945747): - -In contrast with a full recursive tree traversal of configuration dimensions, -> in the future I think we actually want to decrease our matrix somewhat and have only a few mostly-orthogonal builds that taste as many different features as possible on PRs, plus a more complete suite on every PR and maybe an almost full suite nightly/weekly (we don't have this yet). Specifying PR jobs in the future might be easier to read with an explicit list when we come to this. ----------------- ----------------- - -# How do the binaries / nightlies / releases work? - -### What is a binary? - -A binary or package (used interchangeably) is a pre-built collection of c++ libraries, header files, python bits, and other files. We build these and distribute them so that users do not need to install from source. - -A **binary configuration** is a collection of - -* release or nightly - * releases are stable, nightlies are beta and built every night -* python version - * linux: 3.7m (mu is wide unicode or something like that. It usually doesn't matter but you should know that it exists) - * macos: 3.7, 3.8 - * windows: 3.7, 3.8 -* cpu version - * cpu, cuda 9.0, cuda 10.0 - * The supported cuda versions occasionally change -* operating system - * Linux - these are all built on CentOS. There haven't been any problems in the past building on CentOS and using on Ubuntu - * MacOS - * Windows - these are built on Azure pipelines -* devtoolset version (gcc compiler version) - * This only matters on Linux cause only Linux uses gcc. tldr is gcc made a backwards incompatible change from gcc 4.8 to gcc 5, because it had to change how it implemented std::vector and std::string - -### Where are the binaries? - -The binaries are built in CircleCI. There are nightly binaries built every night at 9pm PST (midnight EST) and release binaries corresponding to Pytorch releases, usually every few months. - -We have 3 types of binary packages - -* pip packages - nightlies are stored on s3 (pip install -f \). releases are stored in a pip repo (pip install torch) (ask Soumith about this) -* conda packages - nightlies and releases are both stored in a conda repo. Nighty packages have a '_nightly' suffix -* libtorch packages - these are zips of all the c++ libraries, header files, and sometimes dependencies. These are c++ only - * shared with dependencies (the only supported option for Windows) - * static with dependencies - * shared without dependencies - * static without dependencies - -All binaries are built in CircleCI workflows except Windows. There are checked-in workflows (committed into the .circleci/config.yml) to build the nightlies every night. Releases are built by manually pushing a PR that builds the suite of release binaries (overwrite the config.yml to build the release) - -# CircleCI structure of the binaries - -Some quick vocab: - -* A \**workflow** is a CircleCI concept; it is a DAG of '**jobs**'. ctrl-f 'workflows' on https://github.com/pytorch/pytorch/blob/main/.circleci/config.yml to see the workflows. -* **jobs** are a sequence of '**steps**' -* **steps** are usually just a bash script or a builtin CircleCI command. 
*All steps run in new environments, environment variables declared in one script DO NOT persist to following steps* -* CircleCI has a **workspace**, which is essentially a cache between steps of the *same job* in which you can store artifacts between steps. - -## How are the workflows structured? - -The nightly binaries have 3 workflows. We have one job (actually 3 jobs: build, test, and upload) per binary configuration - -1. binary_builds - 1. every day midnight EST - 2. linux: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/linux-binary-build-defaults.yml - 3. macos: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/macos-binary-build-defaults.yml - 4. For each binary configuration, e.g. linux_conda_3.7_cpu there is a - 1. binary_linux_conda_3.7_cpu_build - 1. Builds the build. On linux jobs this uses the 'docker executor'. - 2. Persists the package to the workspace - 2. binary_linux_conda_3.7_cpu_test - 1. Loads the package to the workspace - 2. Spins up a docker image (on Linux), mapping the package and code repos into the docker - 3. Runs some smoke tests in the docker - 4. (Actually, for macos this is a step rather than a separate job) - 3. binary_linux_conda_3.7_cpu_upload - 1. Logs in to aws/conda - 2. Uploads the package -2. update_s3_htmls - 1. every day 5am EST - 2. https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/binary_update_htmls.yml - 3. See below for what these are for and why they're needed - 4. Three jobs that each examine the current contents of aws and the conda repo and update some html files in s3 -3. binarysmoketests - 1. every day - 2. https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml - 3. For each binary configuration, e.g. linux_conda_3.7_cpu there is a - 1. smoke_linux_conda_3.7_cpu - 1. Downloads the package from the cloud, e.g. using the official pip or conda instructions - 2. Runs the smoke tests - -## How are the jobs structured? - -The jobs are in https://github.com/pytorch/pytorch/tree/main/.circleci/verbatim-sources. Jobs are made of multiple steps. There are some shared steps used by all the binaries/smokes. Steps of these jobs are all delegated to scripts in https://github.com/pytorch/pytorch/tree/main/.circleci/scripts . 
- -* Linux jobs: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/linux-binary-build-defaults.yml - * binary_linux_build.sh - * binary_linux_test.sh - * binary_linux_upload.sh -* MacOS jobs: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/macos-binary-build-defaults.yml - * binary_macos_build.sh - * binary_macos_test.sh - * binary_macos_upload.sh -* Update html jobs: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/binary_update_htmls.yml - * These delegate from the pytorch/builder repo - * https://github.com/pytorch/builder/blob/main/cron/update_s3_htmls.sh - * https://github.com/pytorch/builder/blob/main/cron/upload_binary_sizes.sh -* Smoke jobs (both linux and macos): https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml - * These delegate from the pytorch/builder repo - * https://github.com/pytorch/builder/blob/main/run_tests.sh - * https://github.com/pytorch/builder/blob/main/smoke_test.sh - * https://github.com/pytorch/builder/blob/main/check_binary.sh -* Common shared code (shared across linux and macos): https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/nightly-binary-build-defaults.yml - * binary_checkout.sh - checks out pytorch/builder repo. Right now this also checks out pytorch/pytorch, but it shouldn't. pytorch/pytorch should just be shared through the workspace. This can handle being run before binary_populate_env.sh - * binary_populate_env.sh - parses BUILD_ENVIRONMENT into the separate env variables that make up a binary configuration. Also sets lots of default values, the date, the version strings, the location of folders in s3, all sorts of things. This generally has to be run before other steps. - * binary_install_miniconda.sh - Installs miniconda, cross platform. Also hacks this for the update_binary_sizes job that doesn't have the right env variables - * binary_run_in_docker.sh - Takes a bash script file (the actual test code) from a hardcoded location, spins up a docker image, and runs the script inside the docker image - -### **Why do the steps all refer to scripts?** - -CircleCI creates a final yaml file by inlining every <<* segment, so if we were to keep all the code in the config.yml itself then the config size would go over 4 MB and cause infra problems. - -### **What is binary_run_in_docker for?** - -So, CircleCI has several executor types: macos, machine, and docker are the ones we use. The 'machine' executor gives you two cores on some linux vm. The 'docker' executor gives you considerably more cores (nproc was 32 instead of 2 back when I tried in February). Since the dockers are faster, we try to run everything that we can in dockers. Thus - -* linux build jobs use the docker executor. Running them on the docker executor was at least 2x faster than running them on the machine executor -* linux test jobs use the machine executor in order for them to properly interface with GPUs since docker executors cannot execute with attached GPUs -* linux upload jobs use the machine executor. The upload jobs are so short that it doesn't really matter what they use -* linux smoke test jobs use the machine executor for the same reason as the linux test jobs - -binary_run_in_docker.sh is a way to share the docker start-up code between the binary test jobs and the binary smoke test jobs - -### **Why does binary_checkout also checkout pytorch? 
Why shouldn't it?** - -We want all the nightly binary jobs to run on the exact same git commit, so we wrote our own checkout logic to ensure that the same commit was always picked. Later circleci changed that to use a single pytorch checkout and persist it through the workspace (they did this because our config file was too big, so they wanted to take a lot of the setup code into scripts, but the scripts needed the code repo to exist to be called, so they added a prereq step called 'setup' to checkout the code and persist the needed scripts to the workspace). The changes to the binary jobs were not properly tested, so they all broke from missing pytorch code no longer existing. We hotfixed the problem by adding the pytorch checkout back to binary_checkout, so now there's two checkouts of pytorch on the binary jobs. This problem still needs to be fixed, but it takes careful tracing of which code is being called where. - -# Code structure of the binaries (circleci agnostic) - -## Overview - -The code that runs the binaries lives in two places, in the normal [github.com/pytorch/pytorch](http://github.com/pytorch/pytorch), but also in [github.com/pytorch/builder](http://github.com/pytorch/builder), which is a repo that defines how all the binaries are built. The relevant code is - - -``` -# All code needed to set-up environments for build code to run in, -# but only code that is specific to the current CI system -pytorch/pytorch -- .circleci/ # Folder that holds all circleci related stuff - - config.yml # GENERATED file that actually controls all circleci behavior - - verbatim-sources # Used to generate job/workflow sections in ^ - - scripts/ # Code needed to prepare circleci environments for binary build scripts -- setup.py # Builds pytorch. This is wrapped in pytorch/builder -- cmake files # used in normal building of pytorch -# All code needed to prepare a binary build, given an environment -# with all the right variables/packages/paths. -pytorch/builder -# Given an installed binary and a proper python env, runs some checks -# to make sure the binary was built the proper way. Checks things like -# the library dependencies, symbols present, etc. -- check_binary.sh -# Given an installed binary, runs python tests to make sure everything -# is in order. These should be de-duped. Right now they both run smoke -# tests, but are called from different places. Usually just call some -# import statements, but also has overlap with check_binary.sh above -- run_tests.sh -- smoke_test.sh -# Folders that govern how packages are built. See paragraphs below -- conda/ - - build_pytorch.sh # Entrypoint. Delegates to proper conda build folder - - switch_cuda_version.sh # Switches activate CUDA installation in Docker - - pytorch-nightly/ # Build-folder -- manywheel/ - - build_cpu.sh # Entrypoint for cpu builds - - build.sh # Entrypoint for CUDA builds - - build_common.sh # Actual build script that ^^ call into -- wheel/ - - build_wheel.sh # Entrypoint for wheel builds -- windows/ - - build_pytorch.bat # Entrypoint for wheel builds on Windows -``` - -Every type of package has an entrypoint build script that handles the all the important logic. - -## Conda - -Linux, MacOS and Windows use the same code flow for the conda builds. - -Conda packages are built with conda-build, see https://conda.io/projects/conda-build/en/latest/resources/commands/conda-build.html - -Basically, you pass `conda build` a build folder (pytorch-nightly/ above) that contains a build script and a meta.yaml. 
The meta.yaml specifies in what python environment to build the package in, and what dependencies the resulting package should have, and the build script gets called in the env to build the thing. -tl;dr on conda-build is - -1. Creates a brand new conda environment, based off of deps in the meta.yaml - 1. Note that environment variables do not get passed into this build env unless they are specified in the meta.yaml - 2. If the build fails this environment will stick around. You can activate it for much easier debugging. The "General Python" section below explains what exactly a python "environment" is. -2. Calls build.sh in the environment -3. Copies the finished package to a new conda env, also specified by the meta.yaml -4. Runs some simple import tests (if specified in the meta.yaml) -5. Saves the finished package as a tarball - -The build.sh we use is essentially a wrapper around `python setup.py build`, but it also manually copies in some of our dependent libraries into the resulting tarball and messes with some rpaths. - -The entrypoint file `builder/conda/build_conda.sh` is complicated because - -* It works for Linux, MacOS and Windows - * The mac builds used to create their own environments, since they all used to be on the same machine. There's now a lot of extra logic to handle conda envs. This extra machinery could be removed -* It used to handle testing too, which adds more logic messing with python environments too. This extra machinery could be removed. - -## Manywheels (linux pip and libtorch packages) - -Manywheels are pip packages for linux distros. Note that these manywheels are not actually manylinux compliant. - -`builder/manywheel/build_cpu.sh` and `builder/manywheel/build.sh` (for CUDA builds) just set different env vars and then call into `builder/manywheel/build_common.sh` - -The entrypoint file `builder/manywheel/build_common.sh` is really really complicated because - -* This used to handle building for several different python versions at the same time. The loops have been removed, but there's still unnecessary folders and movements here and there. - * The script is never used this way anymore. This extra machinery could be removed. -* This used to handle testing the pip packages too. This is why there's testing code at the end that messes with python installations and stuff - * The script is never used this way anymore. This extra machinery could be removed. -* This also builds libtorch packages - * This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python specific stuff in this file. -* There is a lot of messing with rpaths. This is necessary, but could be made much much simpler if the above issues were fixed. - -## Wheels (MacOS pip and libtorch packages) - -The entrypoint file `builder/wheel/build_wheel.sh` is complicated because - -* The mac builds used to all run on one machine (we didn't have autoscaling mac machines till circleci). So this script handled siloing itself by setting-up and tearing-down its build env and siloing itself into its own build directory. - * The script is never used this way anymore. This extra machinery could be removed. -* This also builds libtorch packages - * Ditto the comment above. This should definitely be separated out. - -Note that the MacOS Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda.
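For orientation, the conda flow described a few sections above boils down to a single `conda build` call over the recipe folder. A minimal sketch follows; the Python version and output folder are illustrative placeholders, and the real entrypoint in `builder/conda/` sets many more variables before getting here:

```sh
# Hypothetical direct invocation of the pytorch-nightly recipe folder.
# conda-build creates a fresh env from meta.yaml, runs build.sh inside it,
# and saves the finished package as a tarball in the output folder.
cd builder/conda
conda build pytorch-nightly/ --python 3.8 --output-folder /final_pkgs
```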
- -## Windows Wheels (Windows pip and libtorch packages) - -The entrypoint file `builder/windows/build_pytorch.bat` is complicated because - -* This used to handle building for several different python versions at the same time. This is why there are loops everywhere - * The script is never used this way anymore. This extra machinery could be removed. -* This used to handle testing the pip packages too. This is why there's testing code at the end that messes with python installations and stuff - * The script is never used this way anymore. This extra machinery could be removed. -* This also builds libtorch packages - * This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python specific stuff in this file. - -Note that the Windows Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda. - -## General notes - -### Note on run_tests.sh, smoke_test.sh, and check_binary.sh - -* These should all be consolidated -* These must run on all OS types: MacOS, Linux, and Windows -* These all run smoke tests at the moment. They inspect the packages some, maybe run a few import statements. They DO NOT run the python tests nor the cpp tests. The idea is that python tests on main and PR merges will catch all breakages. All these tests have to do is make sure the special binary machinery didn't mess anything up. -* There are separate run_tests.sh and smoke_test.sh because one used to be called by the smoke jobs and one used to be called by the binary test jobs (see circleci structure section above). This is still true actually, but these could be united into a single script that runs these checks, given an installed pytorch package. - -### Note on libtorch - -Libtorch packages are built in the wheel build scripts: manywheel/build_*.sh for linux and build_wheel.sh for mac. There are several things wrong with this - -* It's confusing. Most of those scripts deal with python specifics. -* The extra conditionals everywhere severely complicate the wheel build scripts -* The process for building libtorch is different from the official instructions (a plain call to cmake, or a call to a script) - -### Note on docker images / Dockerfiles - -All linux builds occur in docker images. The docker images are - -* pytorch/conda-cuda - * Has ALL CUDA versions installed. The script pytorch/builder/conda/switch_cuda_version.sh sets /usr/local/cuda to a symlink to e.g. /usr/local/cuda-10.0 to enable different CUDA builds - * Also used for cpu builds -* pytorch/manylinux-cuda90 -* pytorch/manylinux-cuda100 - * Also used for cpu builds - -The Dockerfiles are available in pytorch/builder, but there is no circleci job or script to build these docker images, and they cannot be run locally (unless you have the correct local packages/paths). Only Soumith can build them right now. - -### General Python - -* This is still a good explanation of python installations https://caffe2.ai/docs/faq.html#why-do-i-get-import-errors-in-python-when-i-try-to-use-caffe2 - -# How to manually rebuild the binaries - -tl;dr make a PR that looks like https://github.com/pytorch/pytorch/pull/21159 - -Sometimes we want to push a change to main and then rebuild all of today's binaries after that change. As of May 30, 2019 there isn't a way to manually run a workflow in the UI. You can manually re-run a workflow, but it will use the exact same git commits as the first run and will not include any changes.
So we have to make a PR and then force circleci to run the binary workflow instead of the normal tests. The above PR is an example of how to do this; essentially you copy-paste the binarybuilds workflow steps into the default workflow steps. If you need to point the builder repo to a different commit then you'd need to change https://github.com/pytorch/pytorch/blob/main/.circleci/scripts/binary_checkout.sh#L42-L45 to checkout what you want. - -## How to test changes to the binaries via .circleci - -Writing PRs that test the binaries is annoying, since the default circleci jobs that run on PRs are not the jobs that you want to run. Likely, changes to the binaries will touch something under .circleci/ and require that .circleci/config.yml be regenerated (.circleci/config.yml controls all .circleci behavior, and is generated using `.circleci/regenerate.sh` in python 3.7). But you also need to manually hardcode the binary jobs that you want to test into the .circleci/config.yml workflow, so you should actually make at least two commits, one for your changes and one to temporarily hardcode jobs. See https://github.com/pytorch/pytorch/pull/22928 as an example of how to do this. - -```sh -# Make your changes -touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml -# Regenerate the yaml, has to be in python 3.7 -.circleci/regenerate.sh -# Make a commit -git add .circleci * -git commit -m "My real changes" -git push origin my_branch -# Now hardcode the jobs that you want in the .circleci/config.yml workflows section -# Also eliminate ensure-consistency and should_run_job checks -# e.g. https://github.com/pytorch/pytorch/commit/2b3344bfed8772fe86e5210cc4ee915dee42b32d -# Make a commit you won't keep -git add .circleci -git commit -m "[DO NOT LAND] testing binaries for above changes" -git push origin my_branch -# Now you need to make some changes to the first commit. -git rebase -i HEAD~2 # mark the first commit as 'edit' -# Make the changes -touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml -.circleci/regenerate.sh -# Ammend the commit and recontinue -git add .circleci -git commit --amend -git rebase --continue -# Update the PR, need to force since the commits are different now -git push origin my_branch --force -``` - -The advantage of this flow is that you can make new changes to the base commit and regenerate the .circleci without having to re-write which binary jobs you want to test on. The downside is that all updates will be force pushes. - -## How to build a binary locally - -### Linux - -You can build Linux binaries locally easily using docker. - -```sh -# Run the docker -# Use the correct docker image, pytorch/conda-cuda used here as an example -# -# -v path/to/foo:path/to/bar makes path/to/foo on your local machine (the -# machine that you're running the command on) accessible to the docker -# container at path/to/bar. So if you then run `touch path/to/bar/baz` -# in the docker container then you will see path/to/foo/baz on your local -# machine. You could also clone the pytorch and builder repos in the docker. -# -# If you know how, add ccache as a volume too and speed up everything -docker run \ - -v your/pytorch/repo:/pytorch \ - -v your/builder/repo:/builder \ - -v where/you/want/packages/to/appear:/final_pkgs \ - -it pytorch/conda-cuda /bin/bash -# Export whatever variables are important to you. 
All variables that you'd -# possibly need are in .circleci/scripts/binary_populate_env.sh -# You should probably always export at least these 3 variables -export PACKAGE_TYPE=conda -export DESIRED_PYTHON=3.7 -export DESIRED_CUDA=cpu -# Call the entrypoint -# `|& tee foo.log` just copies all stdout and stderr output to foo.log -# The builds generate lots of output so you probably need this when -# building locally. -/builder/conda/build_pytorch.sh |& tee build_output.log -``` - -**Building CUDA binaries on docker** - -You can build CUDA binaries on CPU only machines, but you can only run CUDA binaries on CUDA machines. This means that you can build a CUDA binary on a docker on your laptop if you so choose (though it's gonna take a long time). - -For Facebook employees, ask about beefy machines that have docker support and use those instead of your laptop; it will be 5x as fast. - -### MacOS - -There's no easy way to generate reproducible hermetic MacOS environments. If you have a Mac laptop then you can try emulating the .circleci environments as much as possible, but you probably have packages in /usr/local/, possibly installed by brew, that will probably interfere with the build. If you're trying to repro an error on a Mac build in .circleci and you can't seem to repro locally, then my best advice is actually to iterate on .circleci :/ - -But if you want to try, then I'd recommend - -```sh -# Create a new terminal -# Clear your LD_LIBRARY_PATH and trim as much out of your PATH as you -# know how to do -# Install a new miniconda -# First remove any other python or conda installation from your PATH -# Always install miniconda 3, even if building for Python <3 -new_conda="~/my_new_conda" -conda_sh="$new_conda/install_miniconda.sh" -curl -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -chmod +x "$conda_sh" -"$conda_sh" -b -p "$MINICONDA_ROOT" -rm -f "$conda_sh" -export PATH="~/my_new_conda/bin:$PATH" -# Create a clean python env -# All MacOS builds use conda to manage the python env and dependencies -# that are built with, even the pip packages -conda create -yn binary python=2.7 -conda activate binary -# Export whatever variables are important to you. All variables that you'd -# possibly need are in .circleci/scripts/binary_populate_env.sh -# You should probably always export at least these 3 variables -export PACKAGE_TYPE=conda -export DESIRED_PYTHON=3.7 -export DESIRED_CUDA=cpu -# Call the entrypoint you want -path/to/builder/wheel/build_wheel.sh -``` - -N.B. installing a brand new miniconda is important. This has to do with how conda installations work. See the "General Python" section above, but tldr; is that - -1. You make the 'conda' command accessible by prepending `path/to/conda_root/bin` to your PATH. -2. You make a new env and activate it, which then also gets prepended to your PATH. Now you have `path/to/conda_root/envs/new_env/bin:path/to/conda_root/bin:$PATH` -3. Now say you (or some code that you ran) call python executable `foo` - 1. if you installed `foo` in `new_env`, then `path/to/conda_root/envs/new_env/bin/foo` will get called, as expected. - 2. But if you forgot to installed `foo` in `new_env` but happened to previously install it in your root conda env (called 'base'), then unix/linux will still find `path/to/conda_root/bin/foo` . This is dangerous, since `foo` can be a different version than you want; `foo` can even be for an incompatible python version!
- -Newer conda versions and proper python hygiene can prevent this, but just install a new miniconda to be safe. - -### Windows - -TODO: fill in +The PyTorch migration from CircleCI to GitHub Actions has been completed. All continuous integration & deployment workflows are defined in the `.github/workflows` folder. diff --git a/.circleci/scripts/binary_checkout.sh b/.circleci/scripts/binary_checkout.sh deleted file mode 100755 index 7bcf0b7b6431d..0000000000000 --- a/.circleci/scripts/binary_checkout.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -set -eux -o pipefail - -retry () { - $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) -} - - -# This step runs on multiple executors with different envfile locations -if [[ "$(uname)" == Darwin ]]; then - # macos executor (builds and tests) - workdir="/Users/distiller/project" -elif [[ "$OSTYPE" == "msys" ]]; then - # windows executor (builds and tests) - rm -rf /c/w - ln -s "/c/Users/circleci/project" /c/w - workdir="/c/w" -elif [[ -d "/home/circleci/project" ]]; then - # machine executor (binary tests) - workdir="/home/circleci/project" -else - # docker executor (binary builds) - workdir="/" -fi - -# It is very important that this stays in sync with binary_populate_env.sh -if [[ "$OSTYPE" == "msys" ]]; then - # We need to make the paths as short as possible on Windows - export PYTORCH_ROOT="$workdir/p" - export BUILDER_ROOT="$workdir/b" -else - export PYTORCH_ROOT="$workdir/pytorch" - export BUILDER_ROOT="$workdir/builder" -fi - -# Try to extract PR number from branch if not already set -if [[ -z "${CIRCLE_PR_NUMBER:-}" ]]; then - CIRCLE_PR_NUMBER="$(echo ${CIRCLE_BRANCH} | sed -E -n 's/pull\/([0-9]*).*/\1/p')" -fi - -# Clone the Pytorch branch -retry git clone https://github.com/pytorch/pytorch.git "$PYTORCH_ROOT" -pushd "$PYTORCH_ROOT" -if [[ -n "${CIRCLE_PR_NUMBER:-}" ]]; then - # "smoke" binary build on PRs - git fetch --force origin "pull/${CIRCLE_PR_NUMBER}/head:remotes/origin/pull/${CIRCLE_PR_NUMBER}" - git reset --hard "$CIRCLE_SHA1" - git checkout -q -B "$CIRCLE_BRANCH" - git reset --hard "$CIRCLE_SHA1" -elif [[ -n "${CIRCLE_SHA1:-}" ]]; then - # Scheduled workflows & "smoke" binary build on trunk on PR merges - DEFAULT_BRANCH="$(git remote show $CIRCLE_REPOSITORY_URL | awk '/HEAD branch/ {print $NF}')" - git reset --hard "$CIRCLE_SHA1" - git checkout -q -B $DEFAULT_BRANCH -else - echo "Can't tell what to checkout" - exit 1 -fi -retry git submodule update --init --recursive -echo "Using Pytorch from " -git --no-pager log --max-count 1 -popd - -# Clone the Builder main repo -retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT" -pushd "$BUILDER_ROOT" -echo "Using builder from " -git --no-pager log --max-count 1 -popd diff --git a/.circleci/scripts/binary_install_miniconda.sh b/.circleci/scripts/binary_install_miniconda.sh deleted file mode 100755 index ce08805bd5b04..0000000000000 --- a/.circleci/scripts/binary_install_miniconda.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -set -eux -o pipefail - -# This step runs on multiple executors with different envfile locations -if [[ "$(uname)" == Darwin ]]; then - envfile="/Users/distiller/project/env" -elif [[ -d "/home/circleci/project" ]]; then - # machine executor (binary tests) - envfile="/home/circleci/project/env" -else - # docker executor (binary builds) - envfile="/env" -fi - -# TODO this is super hacky and ugly.
Basically, the binary_update_html job does -# not have an env file, since it does not call binary_populate_env.sh, since it -# does not have a BUILD_ENVIRONMENT. So for this one case, which we detect by a -# lack of an env file, we manually export the environment variables that we -# need to install miniconda -if [[ ! -f "$envfile" ]]; then - MINICONDA_ROOT="/home/circleci/project/miniconda" - workdir="/home/circleci/project" - retry () { - $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) - } - export -f retry -else - source "$envfile" -fi - -conda_sh="$workdir/install_miniconda.sh" -if [[ "$(uname)" == Darwin ]]; then - curl --retry 3 --retry-all-errors -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-MacOSX-x86_64.sh -else - curl --retry 3 --retry-all-errors -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -fi -chmod +x "$conda_sh" -"$conda_sh" -b -p "$MINICONDA_ROOT" -rm -f "$conda_sh" - -# We can't actually add miniconda to the PATH in the envfile, because that -# breaks 'unbuffer' in Mac jobs. This is probably because conda comes with -# a tclsh, which then gets inserted before the tclsh needed in /usr/bin diff --git a/.circleci/scripts/binary_macos_build.sh b/.circleci/scripts/binary_macos_build.sh index 8ee131de0435f..3f9e6e8eb5156 100755 --- a/.circleci/scripts/binary_macos_build.sh +++ b/.circleci/scripts/binary_macos_build.sh @@ -4,10 +4,6 @@ set -eux -o pipefail source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" -if [[ -z "${GITHUB_ACTIONS:-}" ]]; then - export PATH="${workdir:-${HOME}}/miniconda/bin:${PATH}" -fi - # Build export USE_PYTORCH_METAL_EXPORT=1 export USE_COREML_DELEGATE=1 diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 668de45e2c7b1..287423641d777 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -3,17 +3,9 @@ set -eux -o pipefail export TZ=UTC tagged_version() { - # Grabs version from either the env variable CIRCLE_TAG - # or the pytorch git described version - if [[ "$OSTYPE" == "msys" && -z "${GITHUB_ACTIONS:-}" ]]; then - GIT_DIR="${workdir}/p/.git" - else - GIT_DIR="${workdir}/pytorch/.git" - fi + GIT_DIR="${workdir}/pytorch/.git" GIT_DESCRIBE="git --git-dir ${GIT_DIR} describe --tags --match v[0-9]*.[0-9]*.[0-9]*" - if [[ -n "${CIRCLE_TAG:-}" ]]; then - echo "${CIRCLE_TAG}" - elif [[ ! -d "${GIT_DIR}" ]]; then + if [[ ! -d "${GIT_DIR}" ]]; then echo "Abort, abort! Git dir ${GIT_DIR} does not exists!" 
kill $$ elif ${GIT_DESCRIBE} --exact >/dev/null; then @@ -59,6 +51,7 @@ PIP_UPLOAD_FOLDER='nightly/' # We put this here so that OVERRIDE_PACKAGE_VERSION below can read from it export DATE="$(date -u +%Y%m%d)" BASE_BUILD_VERSION="$(cat ${PYTORCH_ROOT}/version.txt|cut -da -f1).dev${DATE}" + # Change BASE_BUILD_VERSION to git tag when on a git tag # Use 'git -C' to make doubly sure we're in the correct directory for checking # the git tag @@ -78,6 +71,35 @@ fi export PYTORCH_BUILD_NUMBER=1 +# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS +TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) + +# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT +if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then + # Only linux Python < 3.12 are supported wheels for triton + TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.12'" + TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" + if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then + TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt) + TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}" + fi + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" +fi + +# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package +if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then + TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}" + if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then + TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt) + TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}" + fi + if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}" + else + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" + fi +fi + JAVA_HOME= BUILD_JNI=OFF if [[ "$PACKAGE_TYPE" == libtorch ]]; then @@ -123,12 +145,13 @@ if [[ "${OSTYPE}" == "msys" ]]; then else export DESIRED_DEVTOOLSET="${DESIRED_DEVTOOLSET:-}" fi -export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" + export DATE="$DATE" export NIGHTLIES_DATE_PREAMBLE=1.14.0.dev export PYTORCH_BUILD_VERSION="$PYTORCH_BUILD_VERSION" export PYTORCH_BUILD_NUMBER="$PYTORCH_BUILD_NUMBER" export OVERRIDE_PACKAGE_VERSION="$PYTORCH_BUILD_VERSION" +export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" # TODO: We don't need this anymore IIUC export TORCH_PACKAGE_NAME='torch' @@ -161,28 +184,6 @@ if [[ "$(uname)" != Darwin ]]; then EOL fi -if [[ -z "${GITHUB_ACTIONS:-}" ]]; then - cat >>"$envfile" <> "$envfile" echo ' $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)' >> "$envfile" echo '}' >> "$envfile" diff --git a/.circleci/scripts/binary_run_in_docker.sh b/.circleci/scripts/binary_run_in_docker.sh deleted file mode 100755 index 4af14becb4264..0000000000000 --- a/.circleci/scripts/binary_run_in_docker.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -# This section is used in the binary_test and smoke_test jobs. 
It expects -# 'binary_populate_env' to have populated /home/circleci/project/env and it -# expects another section to populate /home/circleci/project/ci_test_script.sh -# with the code to run in the docker - -# Expect all needed environment variables to be written to this file -source /home/circleci/project/env -echo "Running the following code in Docker" -cat /home/circleci/project/ci_test_script.sh -echo -echo -set -eux -o pipefail - -# Expect actual code to be written to this file -chmod +x /home/circleci/project/ci_test_script.sh - -VOLUME_MOUNTS="-v /home/circleci/project/:/circleci_stuff -v /home/circleci/project/final_pkgs:/final_pkgs -v ${PYTORCH_ROOT}:/pytorch -v ${BUILDER_ROOT}:/builder" -# Run the docker -if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then - export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --gpus all ${VOLUME_MOUNTS} -t -d "${DOCKER_IMAGE}") -else - export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined ${VOLUME_MOUNTS} -t -d "${DOCKER_IMAGE}") -fi - -# Execute the test script that was populated by an earlier section -export COMMAND='((echo "source /circleci_stuff/env && /circleci_stuff/ci_test_script.sh") | docker exec -i "$id" bash) 2>&1' -echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts diff --git a/.circleci/scripts/setup_ci_environment.sh b/.circleci/scripts/setup_ci_environment.sh deleted file mode 100755 index 42a605cd44451..0000000000000 --- a/.circleci/scripts/setup_ci_environment.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env bash -set -ex -o pipefail - -# Remove unnecessary sources -sudo rm -f /etc/apt/sources.list.d/google-chrome.list -sudo rm -f /etc/apt/heroku.list -sudo rm -f /etc/apt/openjdk-r-ubuntu-ppa-xenial.list -sudo rm -f /etc/apt/partner.list - -# To increase the network reliability, let apt decide which mirror is best to use -sudo sed -i -e 's/http:\/\/.*archive/mirror:\/\/mirrors/' -e 's/\/ubuntu\//\/mirrors.txt/' /etc/apt/sources.list - -retry () { - $* || $* || $* || $* || $* -} - -# Method adapted from here: https://askubuntu.com/questions/875213/apt-get-to-retry-downloading -# (with use of tee to avoid permissions problems) -# This is better than retrying the whole apt-get command -echo "APT::Acquire::Retries \"3\";" | sudo tee /etc/apt/apt.conf.d/80-retries - -retry sudo apt-get update -qq -retry sudo apt-get -y install \ - moreutils \ - expect-dev - -echo "== DOCKER VERSION ==" -docker version - -if ! command -v aws >/dev/null; then - retry sudo pip3 -q install awscli==1.19.64 -fi - -if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then - DRIVER_FN="NVIDIA-Linux-x86_64-515.76.run" - wget "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN" - sudo /bin/bash "$DRIVER_FN" -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false) - nvidia-smi - - # Taken directly from https://github.com/NVIDIA/nvidia-docker - # Add the package repositories - distribution=$(. 
/etc/os-release;echo "$ID$VERSION_ID") - curl -s -L --retry 3 --retry-all-errors https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - - curl -s -L --retry 3 --retry-all-errors "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list" | sudo tee /etc/apt/sources.list.d/nvidia-docker.list - - retry sudo apt-get update -qq - # Necessary to get the `--gpus` flag to function within docker - retry sudo apt-get install -y nvidia-container-toolkit - sudo systemctl restart docker -else - # Explicitly remove nvidia docker apt repositories if not building for cuda - sudo rm -rf /etc/apt/sources.list.d/nvidia-docker.list -fi - -add_to_env_file() { - local name=$1 - local value=$2 - case "$value" in - *\ *) - # BASH_ENV should be set by CircleCI - echo "${name}='${value}'" >> "${BASH_ENV:-/tmp/env}" - ;; - *) - echo "${name}=${value}" >> "${BASH_ENV:-/tmp/env}" - ;; - esac -} - -add_to_env_file CI_MASTER "${CI_MASTER:-}" -add_to_env_file COMMIT_SOURCE "${CIRCLE_BRANCH:-}" -add_to_env_file BUILD_ENVIRONMENT "${BUILD_ENVIRONMENT}" -add_to_env_file CIRCLE_PULL_REQUEST "${CIRCLE_PULL_REQUEST}" - - -if [[ "${BUILD_ENVIRONMENT}" == *-build ]]; then - add_to_env_file SCCACHE_BUCKET ossci-compiler-cache-circleci-v2 - - SCCACHE_MAX_JOBS=$(( $(nproc) - 1 )) - MEMORY_LIMIT_MAX_JOBS=8 # the "large" resource class on CircleCI has 32 CPU cores, if we use all of them we'll OOM - MAX_JOBS=$(( ${SCCACHE_MAX_JOBS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${SCCACHE_MAX_JOBS} )) - add_to_env_file MAX_JOBS "${MAX_JOBS}" - - if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then - add_to_env_file TORCH_CUDA_ARCH_LIST 5.2 - fi - - if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then - # This IAM user allows write access to S3 bucket for sccache & bazels3cache - set +x - add_to_env_file XLA_CLANG_CACHE_S3_BUCKET_NAME "${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}" - add_to_env_file AWS_ACCESS_KEY_ID "${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" - add_to_env_file AWS_SECRET_ACCESS_KEY "${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" - set -x - else - # This IAM user allows write access to S3 bucket for sccache - set +x - add_to_env_file XLA_CLANG_CACHE_S3_BUCKET_NAME "${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}" - add_to_env_file AWS_ACCESS_KEY_ID "${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" - add_to_env_file AWS_SECRET_ACCESS_KEY "${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" - set -x - fi -fi - -# This IAM user only allows read-write access to ECR -set +x -export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V4:-} -export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V4:-} -export AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") -export AWS_REGION=us-east-1 -aws ecr get-login-password --region $AWS_REGION|docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com -set -x diff --git a/.circleci/scripts/setup_linux_system_environment.sh b/.circleci/scripts/setup_linux_system_environment.sh deleted file mode 100755 index 780f7c1bd3790..0000000000000 --- a/.circleci/scripts/setup_linux_system_environment.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash -set -eux -o pipefail - -# Set up CircleCI GPG keys for apt, if needed -curl --retry 3 --retry-all-errors -s -L https://packagecloud.io/circleci/trusty/gpgkey | sudo apt-key add - - -# Stop background apt updates. 
Hypothetically, the kill should not -# be necessary, because stop is supposed to send a kill signal to -# the process, but we've added it for good luck. Also -# hypothetically, it's supposed to be unnecessary to wait for -# the process to block. We also have that line for good luck. -# If you like, try deleting them and seeing if it works. -sudo systemctl stop apt-daily.service || true -sudo systemctl kill --kill-who=all apt-daily.service || true - -sudo systemctl stop unattended-upgrades.service || true -sudo systemctl kill --kill-who=all unattended-upgrades.service || true - -# wait until `apt-get update` has been killed -while systemctl is-active --quiet apt-daily.service -do - sleep 1; -done -while systemctl is-active --quiet unattended-upgrades.service -do - sleep 1; -done - -# See if we actually were successful -systemctl list-units --all | cat - -# For good luck, try even harder to kill apt-get -sudo pkill apt-get || true - -# For even better luck, purge unattended-upgrades -sudo apt-get purge -y unattended-upgrades || true - -cat /etc/apt/sources.list - -# For the bestest luck, kill again now -sudo pkill apt || true -sudo pkill dpkg || true - -# Try to detect if apt/dpkg is stuck -if ps auxfww | grep '[a]pt'; then - echo "WARNING: There are leftover apt processes; subsequent apt update will likely fail" -fi -if ps auxfww | grep '[d]pkg'; then - echo "WARNING: There are leftover dpkg processes; subsequent apt update will likely fail" -fi diff --git a/.clang-tidy b/.clang-tidy index 7e2313b94c061..d0d74d154c6cf 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -36,13 +36,13 @@ hicpp-exception-baseclass, hicpp-avoid-goto, misc-*, -misc-const-correctness, +-misc-include-cleaner, -misc-use-anonymous-namespace, -misc-unused-parameters, -misc-no-recursion, -misc-non-private-member-variables-in-classes, -misc-confusable-identifiers, modernize-*, --modernize-concat-nested-namespaces, -modernize-macro-to-enum, -modernize-return-braced-init-list, -modernize-use-auto, diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 97f56d61cba89..e151576219af2 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -30,5 +30,5 @@ RUN if [ -n "$CLANG_VERSION" ]; then \ # Install cuda if version is specified ARG CUDA_VERSION RUN if [ -n "$CUDA_VERSION" ]; then \ - conda install cuda -c "nvidia/label/cuda-${CUDA_VERSION}"; \ + conda install -y cuda -c "nvidia/label/cuda-${CUDA_VERSION}"; \ fi diff --git a/.devcontainer/README.md b/.devcontainer/README.md index ef4067a326012..17e4e4958ca85 100644 --- a/.devcontainer/README.md +++ b/.devcontainer/README.md @@ -46,7 +46,7 @@ If you are using [Visual Studio Code Remote - SSH](https://code.visualstudio.com ## Step 6: Open in DevContainer -1. In VSCode, use the Command Palette (`Ctrl+Shift+P` or `Cmd+Shift+P` on macOS) to run the "Remote-Containers: Open Folder in Container..." command. +1. In VSCode, use the Command Palette (`Ctrl+Shift+P` or `Cmd+Shift+P` on macOS) to run the "Dev Containers: Open Folder in Container..." command. 2. You will be prompted with two options: CPU dev container or CUDA dev container. Choose the one you want to run. 
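If you choose the CUDA dev container, a quick sanity check once it is up and running; this assumes the host exposes a GPU to the container and that PyTorch has already been built or installed inside it:

```sh
# Confirm the GPU is visible to the container and to PyTorch.
nvidia-smi
python -c "import torch; print(torch.version.cuda, torch.cuda.is_available())"
```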
## Step 7: Wait for Building the Environment diff --git a/.flake8 b/.flake8 index c59af78be7bc4..e3a90f36aaf9a 100644 --- a/.flake8 +++ b/.flake8 @@ -2,7 +2,7 @@ # NOTE: **Mirror any changes** to this file the [tool.ruff] config in pyproject.toml # before we can fully move to use ruff enable-extensions = G -select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2 +select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2,TOR9 max-line-length = 120 # C408 ignored because we like the dict keyword argument syntax # E501 is not flexible enough, we're using B950 instead @@ -27,6 +27,9 @@ ignore = # TODO(kit1980): fix all TOR102 issues # `torch.load` without `weights_only` parameter is unsafe TOR102, + # TODO(kit1980): resolve all TOR003 issues + # pass `use_reentrant` explicitly to `checkpoint`. + TOR003 per-file-ignores = __init__.py: F401 test/**: F821 @@ -34,6 +37,24 @@ per-file-ignores = torch/utils/cpp_extension.py: B950 torchgen/api/types/__init__.py: F401,F403 torchgen/executorch/api/types/__init__.py: F401,F403 + test/dynamo/test_higher_order_ops.py: B950 + torch/testing/_internal/dynamo_test_failures.py: B950 + # TOR901 is only for test, we want to ignore it for everything else. + # It's not easy to configure this without affecting other per-file-ignores, + # so we explicitly list every file where it's violated outside of test. + torch/__init__.py: F401,TOR901 + torch/_custom_op/impl.py: TOR901 + torch/_export/serde/upgrade.py: TOR901 + torch/_functorch/vmap.py: TOR901 + torch/_inductor/test_operators.py: TOR901 + torch/_library/abstract_impl.py: TOR901 + torch/_meta_registrations.py: TOR901 + torch/_prims/__init__.py: F401,TOR901 + torch/_prims/rng_prims.py: TOR901 + torch/ao/quantization/fx/_decomposed.py: TOR901 + torch/distributed/_functional_collectives.py: TOR901 + torch/distributed/_spmd/data_parallel.py: TOR901 + torch/distributed/_tensor/_collective_utils.py: TOR901 optional-ascii-coding = True exclude = ./.git, diff --git a/.gitattributes b/.gitattributes index 8bccf04bbb7dc..e904301752950 100644 --- a/.gitattributes +++ b/.gitattributes @@ -4,3 +4,4 @@ .github/generated-* linguist-generated=true .github/scripts/gql_mocks.json linguist-generated=true third_party/LICENSES_BUNDLED.txt linguist-generated=true +tools/build/bazel/requirements.txt linguist-generated=true diff --git a/.github/ISSUE_TEMPLATE/pt2-bug-report.yml b/.github/ISSUE_TEMPLATE/pt2-bug-report.yml index c4dc9aa772fee..7ba631fb05cc6 100644 --- a/.github/ISSUE_TEMPLATE/pt2-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/pt2-bug-report.yml @@ -8,7 +8,18 @@ body: value: > #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/pytorch/pytorch/issues) - It's likely that your bug will be resolved by checking our FAQ or troubleshooting guide [documentation](https://pytorch.org/docs/master/dynamo/index.html) + It's likely that your bug will be resolved by checking our FAQ or troubleshooting guide [documentation](https://pytorch.org/docs/main/dynamo/index.html) + + Note: if you're submitting an issue that you generated from a fuzzer, please do the following: + + - Ensure rtol/atol are at default tolerances + + - Don't compare indices of max/min etc., because that avoids the above requirement + + - If comparing eager and torch.compile at fp16/bf16, you should use fp32 as baseline + + If the above requirements are met, add the label "topic: fuzzer" to your issue.
+ - type: textarea attributes: label: 🐛 Describe the bug @@ -33,7 +44,7 @@ body: label: Minified repro description: | Please run the minifier on your example and paste the minified code below - Learn more here https://pytorch.org/docs/master/compile/troubleshooting.html + Learn more here https://pytorch.org/docs/main/torch.compiler_troubleshooting.html placeholder: | env TORCHDYNAMO_REPRO_AFTER="aot" python your_model.py or diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 58cdbd2659768..05fc1243251b1 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -19,8 +19,9 @@ self-hosted-runner: - windows.g5.4xlarge.nvidia.gpu - bm-runner - linux.rocm.gpu - - macos-m1-12 + - macos-m1-stable - macos-m1-13 + - macos-m1-14 - macos-12-xl - macos-12 - macos12.3-m1 diff --git a/.github/actions/download-build-artifacts/action.yml b/.github/actions/download-build-artifacts/action.yml index a7107f2067def..2deeda72802dd 100644 --- a/.github/actions/download-build-artifacts/action.yml +++ b/.github/actions/download-build-artifacts/action.yml @@ -9,6 +9,10 @@ inputs: use-gha: description: If set to any value, use GHA to download the artifact. Otherwise use s3. required: false + s3-bucket: + description: S3 bucket to download builds + required: false + default: "gha-artifacts" runs: using: composite @@ -18,9 +22,10 @@ runs: uses: seemethere/download-artifact-s3@v4 with: name: ${{ inputs.name }} + s3-bucket: ${{ inputs.s3-bucket }} - name: Download PyTorch Build Artifacts from GHA - if: inputs.use-gha + if: ${{ inputs.use-gha }} uses: actions/download-artifact@v3 with: name: ${{ inputs.name }} @@ -29,6 +34,10 @@ runs: shell: bash run: unzip -o artifacts.zip + - name: Remove artifacts.zip + shell: bash + run: rm artifacts.zip + - name: Output disk space left shell: bash run: df -H diff --git a/.github/actions/download-td-artifacts/action.yml b/.github/actions/download-td-artifacts/action.yml new file mode 100644 index 0000000000000..595093abaead0 --- /dev/null +++ b/.github/actions/download-td-artifacts/action.yml @@ -0,0 +1,29 @@ +name: Download TD Artifacts + +description: Download artifacts from target_determination.yml + +inputs: + use-gha: + description: If set to any value, use GHA to download the artifact. Otherwise use s3. + required: false + +runs: + using: composite + steps: + - name: Download TD Artifacts from S3 + if: ${{ !inputs.use-gha }} + uses: seemethere/download-artifact-s3@v4 + with: + name: td_results + + - name: Download TD Artifacts from GHA + if: inputs.use-gha + uses: actions/download-artifact@v3 + with: + name: td_results.json + + - name: Move artifacts to .additional_ci_files folder + shell: bash + run: | + mkdir -p .additional_ci_files + mv td_results.json .additional_ci_files/td_results.json diff --git a/.github/actions/filter-test-configs/action.yml b/.github/actions/filter-test-configs/action.yml index 3fb107a17b95e..e1f2067d58076 100644 --- a/.github/actions/filter-test-configs/action.yml +++ b/.github/actions/filter-test-configs/action.yml @@ -13,6 +13,13 @@ inputs: required: true type: string description: JSON description of what test configs to run. + selected-test-configs: + required: false + type: string + description: | + A comma-separated list of test configurations from the test matrix to keep. + The empty list means we are going to keep every configuration by default + default: "" job-name: type: string required: false @@ -26,11 +33,23 @@ outputs: description: True if the filtered test configs matrix is empty. False otherwise.
value: ${{ steps.filter.outputs.is-test-matrix-empty }} keep-going: - description: True if keep-going label was on PR. + description: True if keep-going label was on PR or [keep-going] in PR body. value: ${{ steps.filter.outputs.keep-going }} reenabled-issues: description: Comma separated list of issue numbers that should correspond to disable test issues that the PR fixes value: ${{ steps.filter.outputs.reenabled-issues }} + ci-verbose-test-logs: + description: True if ci-verbose-test-logs label was on PR or [ci-verbose-test-logs] in PR body. + value: ${{ steps.filter.outputs.ci-verbose-test-logs }} + ci-no-test-timeout: + description: True if ci-no-test-timeout label was on PR or [ci-no-test-timeout] in PR body. + value: ${{ steps.filter.outputs.ci-no-test-timeout }} + ci-no-td: + description: True if ci-no-td label was on PR or [ci-no-td] in PR body. + value: ${{ steps.filter.outputs.ci-no-td }} + ci-td-distributed: + description: True if ci-td-distributed label was on PR or [ci-td-distributed] in PR body. + value: ${{ steps.filter.outputs.ci-td-distributed }} runs: using: composite @@ -114,6 +133,7 @@ runs: --workflow "${GITHUB_WORKFLOW}" \ --job-name "${JOB_NAME}" \ --test-matrix "${{ inputs.test-matrix }}" \ + --selected-test-configs "${{ inputs.selected-test-configs }}" \ --pr-number "${PR_NUMBER}" \ --tag "${TAG}" \ --event-name "${EVENT_NAME}" \ diff --git a/.github/actions/linux-build/action.yml b/.github/actions/linux-build/action.yml new file mode 100644 index 0000000000000..c0f74160507bb --- /dev/null +++ b/.github/actions/linux-build/action.yml @@ -0,0 +1,207 @@ +name: linux-build + +inputs: + build-environment: + required: true + description: Top-level label for what's being built/tested. + docker-image-name: + required: true + description: Name of the base docker image to build with. + build-generates-artifacts: + required: false + default: "true" + description: If set, upload generated build artifacts. + build-with-debug: + required: false + default: "false" + description: If set, build in debug mode. + sync-tag: + required: false + default: "" + description: | + If this is set, our linter will use this to make sure that every other + job with the same `sync-tag` is identical. + cuda-arch-list: + required: false + default: "5.2" + description: | + List of CUDA architectures CI build should target. + runner: + required: false + default: "linux.2xlarge" + description: Runner label to select worker type + test-matrix: + required: false + type: string + description: | + An optional JSON description of what test configs to run later on. This + is moved here from the Linux test workflow so that we can apply filter + logic using test-config labels earlier and skip unnecessary builds + s3-bucket: + description: S3 bucket to download artifact + required: false + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + default: "" + GITHUB_TOKEN: + description: GitHub token + required: true + HUGGING_FACE_HUB_TOKEN: + description: Hugging Face Hub token + required: false + default: "" +outputs: + docker-image: + value: ${{ steps.calculate-docker-image.outputs.docker-image }} + description: The docker image containing the built PyTorch. + test-matrix: + value: ${{ steps.filter.outputs.test-matrix }} + description: An optional JSON description of what test configs to run later on.
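The `selected-test-configs` input added to `filter-test-configs` above and the filtered `test-matrix` output of `linux-build` both operate on the same JSON test matrix. The sketch below illustrates the kind of comma-separated filtering involved, assuming a standard GitHub Actions `{"include": [...]}` matrix shape; it is not the actual logic in `.github/scripts/filter_test_configs.py`.

```python
# Rough sketch of selected-test-configs style filtering over a test matrix.
# Assumes an {"include": [...]} layout; not the real filter_test_configs.py.
import json

def filter_matrix(test_matrix: str, selected_test_configs: str) -> str:
    matrix = json.loads(test_matrix)
    keep = {c.strip() for c in selected_test_configs.split(",") if c.strip()}
    if keep:  # an empty selection keeps every configuration
        matrix["include"] = [
            entry for entry in matrix.get("include", [])
            if entry.get("config") in keep
        ]
    return json.dumps(matrix)

print(filter_matrix(
    '{"include": [{"config": "default", "shard": 1}, {"config": "distributed", "shard": 1}]}',
    "default",
))
```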
+ +runs: + using: composite + steps: + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v3 + if: ${{ inputs.aws-role-to-assume != '' }} + with: + role-to-assume: ${{ inputs.aws-role-to-assume }} + role-session-name: gha-linux-build + role-duration-seconds: 10800 + aws-region: us-east-1 + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: ${{ inputs.docker-image-name }} + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + env: + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + tag=${ECR_DOCKER_IMAGE##*/} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Parse ref + id: parse-ref + shell: bash + run: .github/scripts/parse_ref.py + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ inputs.GITHUB_TOKEN }} + + # Apply the filter logic to the build step too if the test-config label is already there + - name: Select all requested test configurations (if the test matrix is available) + id: filter + uses: ./.github/actions/filter-test-configs + with: + github-token: ${{ inputs.GITHUB_TOKEN }} + test-matrix: ${{ inputs.test-matrix }} + job-name: ${{ steps.get-job-id.outputs.job-name }} + + - name: Download pytest cache + uses: ./.github/actions/pytest-cache-download + continue-on-error: true + with: + cache_dir: .pytest_cache + job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} + s3_bucket: ${{ inputs.s3-bucket }} + + - name: Build + if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '' + id: build + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # TODO duplicated + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }} + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} + DEBUG: ${{ inputs.build-with-debug == 'true' && '1' || '0' }} + OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }} + shell: bash + run: | + # detached container should get cleaned up by teardown_ec2_linux + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + -e PR_NUMBER \ + -e SHA1 \ + -e BRANCH \ + -e SCCACHE_BUCKET \ + -e SCCACHE_S3_KEY_PREFIX \ + -e XLA_CUDA \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + -e PR_LABELS \ + -e OUR_GITHUB_JOB_ID \ + -e HUGGING_FACE_HUB_TOKEN \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ 
+ --detach \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh' + + - name: Archive artifacts into zip + if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' + shell: bash + run: | + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files + + - name: Store PyTorch Build Artifacts on S3 + uses: seemethere/upload-artifact-s3@v5 + if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' + with: + name: ${{ inputs.build-environment }} + retention-days: 14 + if-no-files-found: error + path: artifacts.zip + s3-bucket: ${{ inputs.s3-bucket }} + + - name: Upload sccache stats + if: steps.build.outcome != 'skipped' + uses: seemethere/upload-artifact-s3@v5 + with: + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact + retention-days: 365 + if-no-files-found: warn + path: sccache-stats-*.json + s3-bucket: ${{ inputs.s3-bucket }} + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() diff --git a/.github/actions/linux-test/action.yml b/.github/actions/linux-test/action.yml new file mode 100644 index 0000000000000..6c8e761444b0a --- /dev/null +++ b/.github/actions/linux-test/action.yml @@ -0,0 +1,384 @@ +name: linux-test + +inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + docker-image: + required: true + type: string + description: Docker image to run in. + sync-tag: + required: false + type: string + default: "" + description: | + If this is set, our linter will use this to make sure that every other + job with the same `sync-tag` is identical. + use-gha: + required: false + type: string + default: "" + description: If set to any value, upload to GHA. Otherwise upload to S3. 
+ dashboard-tag: + required: false + type: string + default: "" + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" + HUGGING_FACE_HUB_TOKEN: + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub + required: false + default: "" + GITHUB_TOKEN: + description: GitHub token + required: true + +#env: +# GIT_DEFAULT_BRANCH: ${{ inputs.default_branch }} + +runs: + using: composite + steps: + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: configure aws credentials + if : ${{ inputs.aws-role-to-assume != '' }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: ${{ inputs.aws-role-to-assume }} + role-session-name: gha-linux-test + aws-region: us-east-1 + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: ${{ inputs.docker-image }} + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + env: + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + tag=${ECR_DOCKER_IMAGE##*/} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Check if in a ARC runner + shell: bash + id: check_arc_runner + run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT" + + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + id: install-nvidia-driver + uses: pytorch/test-infra/.github/actions/setup-nvidia@main + if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }} + + - name: Lock NVIDIA A100 40GB Frequency + shell: bash + run: | + sudo nvidia-smi -pm 1 + sudo nvidia-smi -ac 1215,1410 + nvidia-smi + if: contains(matrix.runner, 'a100') + + - name: Start monitoring script + id: monitor-script + shell: bash + continue-on-error: true + run: | + python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 + python3 -m tools.stats.monitor > usage_log.txt 2>&1 & + echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" + + - name: Download build artifacts + uses: ./.github/actions/download-build-artifacts + with: + name: ${{ inputs.build-environment }} + s3-bucket: ${{ inputs.s3-bucket }} + + - name: Download TD artifacts + continue-on-error: true + uses: ./.github/actions/download-td-artifacts + + - name: Parse ref + id: parse-ref + shell: bash + run: .github/scripts/parse_ref.py + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ inputs.GITHUB_TOKEN }} + + - name: Check for keep-going label and re-enabled test issues + # This uses the filter-test-configs action because it conviniently + # checks for labels and re-enabled test issues. It does not actually do + # any filtering. All filtering is done in the build step. 
+ id: keep-going + uses: ./.github/actions/filter-test-configs + with: + github-token: ${{ inputs.GITHUB_TOKEN }} + test-matrix: ${{ inputs.test-matrix }} + job-name: ${{ steps.get-job-id.outputs.job-name }} + + - name: Test + id: test + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_WORKFLOW: ${{ github.workflow }} + GITHUB_JOB: ${{ github.job }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }} + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} + VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} + NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} + TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} + SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} + DOCKER_IMAGE: ${{ inputs.docker-image }} + XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} + PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} + DASHBOARD_TAG: ${{ inputs.dashboard-tag }} + HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }} + shell: bash + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.ci/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.ci/onnx/test.sh + else + TEST_COMMAND=.ci/pytorch/test.sh + fi + + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e GITHUB_ACTIONS \ + -e GITHUB_REPOSITORY \ + -e GITHUB_WORKFLOW \ + -e GITHUB_JOB \ + -e GITHUB_RUN_ID \ + -e GITHUB_RUN_NUMBER \ + -e GITHUB_RUN_ATTEMPT \ + -e JOB_ID \ + -e JOB_NAME \ + -e BASE_SHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e REENABLED_ISSUES \ + -e CONTINUE_THROUGH_ERROR \ + -e VERBOSE_TEST_LOGS \ + -e NO_TEST_TIMEOUT \ + -e NO_TD \ + -e TD_DISTRIBUTED \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e SCCACHE_S3_KEY_PREFIX \ + -e XLA_CUDA \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \ + -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e HUGGING_FACE_HUB_TOKEN \ + -e DASHBOARD_TAG \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + # Propagate download.pytorch.org IP to container + grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts" + echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" + docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}" + + - name: Upload pytest cache if tests failed + uses: ./.github/actions/pytest-cache-upload + continue-on-error: true + if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure' + with: + cache_dir: .pytest_cache + shard: ${{ matrix.shard }} + sha: ${{ github.event.pull_request.head.sha || github.sha }} + test_config: ${{ matrix.config }} + job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} + + - name: Print remaining test logs + shell: bash + if: always() && steps.test.conclusion + run: | + cat test/**/*_toprint.log || true + + - name: Stop monitoring script + if: always() && steps.monitor-script.outputs.monitor-script-pid + shell: bash + continue-on-error: true + env: + MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }} + run: | + kill "$MONITOR_SCRIPT_PID" + + - name: Upload test artifacts + uses: ./.github/actions/upload-test-artifacts + if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped' + with: + file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} + use-gha: ${{ inputs.use-gha }} + s3-bucket: ${{ inputs.s3-bucket }} + + - name: Collect backtraces from coredumps (if any) + if: always() + shell: bash + run: | + # shellcheck disable=SC2156 + find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; + + - name: Store Core dumps on S3 + uses: seemethere/upload-artifact-s3@v5 + if: failure() + with: + name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} + retention-days: 14 + if-no-files-found: ignore + path: ./**/core.[1-9]* + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() + + # NB: We are currently having an intermittent GPU-related issue on G5 runners with + # A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does + # not seem to help. Here are some symptoms: + # * Calling nvidia-smi timeouts after 60 second + # * Fail to run nvidia-smi with an unable to determine the device handle for GPU + # unknown error + # * Test fails with a missing CUDA GPU error when initializing CUDA in PyTorch + # * Run docker --gpus all fails with error response from daemon + # + # As both the root cause and recovery path are unclear, let's take the runner out of + # service so that it doesn't get any more jobs + - name: Check NVIDIA driver installation step + if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped' + shell: bash + env: + RUNNER_WORKSPACE: ${{ runner.workspace }} + run: | + set +e + set -x + + nvidia-smi + # NB: Surprisingly, nvidia-smi command returns successfully with return code 0 even in + # the case where the driver has already crashed as it still can get the driver version + # and some basic information like the bus ID. 
However, the rest of the information + # would be missing (ERR!), for example: + # + # +-----------------------------------------------------------------------------+ + # | NVIDIA-SMI 525.89.02 Driver Version: 525.89.02 CUDA Version: 12.0 | + # |-------------------------------+----------------------+----------------------+ + # | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | + # | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | + # | | | MIG M. | + # |===============================+======================+======================| + # | 0 ERR! Off | 00000000:00:1E.0 Off | ERR! | + # |ERR! ERR! ERR! ERR! / ERR! | 4184MiB / 23028MiB | ERR! Default | + # | | | ERR! | + # +-------------------------------+----------------------+----------------------+ + # + # +-----------------------------------------------------------------------------+ + # | Processes: | + # | GPU GI CI PID Type Process name GPU Memory | + # | ID ID Usage | + # |=============================================================================| + # +-----------------------------------------------------------------------------+ + # + # This should be reported as a failure instead as it will guarantee to fail when + # Docker tries to run with --gpus all + # + # So, the correct check here is to query one of the missing piece of info like + # GPU name, so that the command can fail accordingly + nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0 + NVIDIA_SMI_STATUS=$? + + # These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action + if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then + echo "NVIDIA driver installation has failed, shutting down the runner..." + .github/scripts/stop_runner_service.sh + fi + + # For runner with multiple GPUs, we also want to confirm that the number of GPUs are the + # power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issue when one GPU fails + # https://github.com/pytorch/test-infra/issues/4000 + GPU_COUNT=$(nvidia-smi --list-gpus | wc -l) + NVIDIA_SMI_STATUS=$? + + # These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action + if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then + echo "NVIDIA driver installation has failed, shutting down the runner..." + .github/scripts/stop_runner_service.sh + fi + + # Check the GPU count to be a power of 2 + if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then + echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..." + .github/scripts/stop_runner_service.sh + fi diff --git a/.github/actions/pytest-cache-download/action.yml b/.github/actions/pytest-cache-download/action.yml index 1cf2f9f4ab57a..bbbeb9f43090d 100644 --- a/.github/actions/pytest-cache-download/action.yml +++ b/.github/actions/pytest-cache-download/action.yml @@ -9,6 +9,10 @@ inputs: job_identifier: description: Text that uniquely identifies a given job type within a workflow. All shards of a job should share the same job identifier. 
required: true + s3_bucket: + description: S3 bucket to download PyTest cache + required: false + default: "gha-artifacts" runs: using: composite @@ -30,6 +34,7 @@ runs: CACHE_DIR: ${{ inputs.cache_dir }} JOB_IDENTIFIER: ${{ inputs.job_identifier }} REPO: ${{ github.repository }} + BUCKET: ${{ inputs.s3_bucket }} run: | python3 .github/scripts/pytest_cache.py \ --download \ @@ -38,3 +43,4 @@ runs: --job_identifier $JOB_IDENTIFIER \ --temp_dir $RUNNER_TEMP \ --repo $REPO \ + --bucket $BUCKET \ diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml index 98eab13c44077..193dc7d6fd93d 100644 --- a/.github/actions/setup-linux/action.yml +++ b/.github/actions/setup-linux/action.yml @@ -15,10 +15,12 @@ runs: category=$1 # If it is GCP runner (runner name contains gcp), do not run this runner_name_str=${{ runner.name }} - if [[ $runner_name_str != *"gcp"* ]]; then - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - else + if [[ -f /.inarc ]]; then + echo "ARC Runner, no info on ec2 metadata" + elif [[ $runner_name_str == *"gcp"* ]]; then echo "Runner is from Google Cloud Platform, No info on ec2 metadata" + else + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" fi } echo "ami-id: $(get_ec2_metadata ami-id)" @@ -26,8 +28,14 @@ runs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" + - name: Check if in a ARC runner + shell: bash + id: check_arc_runner + run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> $GITHUB_OUTPUT + - name: Start docker if docker deamon is not running shell: bash + if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }} run: | if systemctl is-active --quiet docker; then echo "Docker daemon is running..."; @@ -58,6 +66,7 @@ runs: env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" - name: Kill any existing containers, clean up images + if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }} shell: bash run: | # ignore expansion of "docker ps -q" since it could be empty @@ -96,3 +105,28 @@ runs: echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts cat /etc/hosts + + - name: Check that the docker daemon is running + shell: bash + continue-on-error: true + if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'true' }} + run: | + set +x + + max_attempts=30 + delay=10 + attempt=1 + + for attempt in $(seq 1 $max_attempts); do + echo "Attempt $attempt of $max_attempts: Checking if Docker daemon is running..." + if docker info > /dev/null 2>&1; then + echo "Docker is running. Proceeding with the next steps" + exit 0 + else + echo "Docker is not running yet." + echo "Retrying in $delay seconds..." + sleep $delay + fi + done + echo "Reached maximum attempts to connect to Docker. Exiting." 
+ exit 1 diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml index b9833480954b9..232a1e33a9c86 100644 --- a/.github/actions/setup-rocm/action.yml +++ b/.github/actions/setup-rocm/action.yml @@ -9,6 +9,16 @@ runs: shell: bash run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Remove leftover Docker config file + shell: bash + continue-on-error: true + run: | + set -ex + + cat ~/.docker/config.json || true + # https://stackoverflow.com/questions/64455468/error-when-logging-into-ecr-with-docker-login-error-saving-credentials-not + rm -f ~/.docker/config.json + - name: Stop all running docker containers if: always() shell: bash diff --git a/.github/actions/update-commit-hash/action.yml b/.github/actions/update-commit-hash/action.yml deleted file mode 100644 index 5a21d592d78f7..0000000000000 --- a/.github/actions/update-commit-hash/action.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: Update commit hash - -inputs: - repo-owner: - required: false - type: string - description: Name of repository's owner. - default: pytorch - repo-name: - required: true - type: string - description: Name of the repository we're updating commit hash for. - branch: - required: true - type: string - description: Branch to fetch commit of - pin-folder: - type: string - description: Path to folder with commit pin - required: false - default: .github/ci_commit_pins - updatebot-token: - required: true - type: string - description: update bot token - pytorchbot-token: - required: true - type: string - description: update bot token - -description: update commit hash - -runs: - using: composite - steps: - - name: Checkout repo - uses: actions/checkout@v3 - with: - fetch-depth: 1 - submodules: false - token: ${{ inputs.updatebot-token }} - - name: Checkout - shell: bash - run: | - git clone https://github.com/${{ inputs.repo-owner }}/${{ inputs.repo-name }}.git --quiet - - name: Check if there already exists a PR - shell: bash - env: - REPO_NAME: ${{ inputs.repo-name }} - BRANCH: ${{ inputs.branch }} - PIN_FOLDER: ${{ inputs.pin-folder }} - UPDATEBOT_TOKEN: ${{ inputs.updatebot-token }} - PYTORCHBOT_TOKEN: ${{ inputs.pytorchbot-token }} - NEW_BRANCH_NAME: update-${{ inputs.repo-name }}-commit-hash/${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }} - run: | - # put this here instead of the script to prevent accidentally changing the config when running the script locally - git config --global user.name "PyTorch UpdateBot" - git config --global user.email "pytorchupdatebot@users.noreply.github.com" - python .github/scripts/update_commit_hashes.py --repo-name "${REPO_NAME}" --branch "${BRANCH}" --pin-folder "${PIN_FOLDER}" diff --git a/.github/actions/upload-test-artifacts/action.yml b/.github/actions/upload-test-artifacts/action.yml index 87fa30a675861..04cb43b20c389 100644 --- a/.github/actions/upload-test-artifacts/action.yml +++ b/.github/actions/upload-test-artifacts/action.yml @@ -11,6 +11,10 @@ inputs: Suffix to add to the filename of the artifacts. This should include the workflow job id, see [Job id in artifacts]. 
required: true + s3-bucket: + description: S3 bucket to download builds + required: false + default: "gha-artifacts" runs: using: composite @@ -42,7 +46,7 @@ runs: env: FILE_SUFFIX: ${{ inputs.file-suffix }} run: | - # Remove any previous test reports if they exist + # Remove any previous usage logs if they exist rm -f logs-*.zip # this workflow is also run in bazel build test, but we dont generate usage reports for it # so check to see if the file exists first @@ -53,6 +57,18 @@ runs: zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log' fi + - name: Zip debugging artifacts for upload + if: runner.os != 'Windows' && !inputs.use-gha + shell: bash + env: + FILE_SUFFIX: ${{ inputs.file-suffix }} + run: | + # Remove any previous debugging artifacts if they exist + rm -f debug-*.zip + if [ -d 'test/debug' ]; then + zip -r "debug-${FILE_SUFFIX}.zip" test/debug + fi + # Windows zip - name: Zip JSONs for upload if: runner.os == 'Windows' && !inputs.use-gha @@ -87,6 +103,7 @@ runs: uses: seemethere/upload-artifact-s3@v5 if: ${{ !inputs.use-gha }} with: + s3-bucket: ${{ inputs.s3-bucket }} s3-prefix: | ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact retention-days: 14 @@ -97,6 +114,7 @@ runs: uses: seemethere/upload-artifact-s3@v5 if: ${{ !inputs.use-gha }} with: + s3-bucket: ${{ inputs.s3-bucket }} s3-prefix: | ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact retention-days: 14 @@ -108,12 +126,25 @@ runs: if: ${{ !inputs.use-gha }} continue-on-error: true with: + s3-bucket: ${{ inputs.s3-bucket }} s3-prefix: | ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact retention-days: 14 if-no-files-found: ignore path: logs-*.zip + - name: Store Debug Artifacts on S3 + uses: seemethere/upload-artifact-s3@v5 + if: ${{ !inputs.use-gha }} + continue-on-error: true + with: + s3-bucket: ${{ inputs.s3-bucket }} + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact + retention-days: 14 + if-no-files-found: ignore + path: debug-*.zip + # GHA upload - name: Store Test Downloaded JSONs on Github uses: actions/upload-artifact@v3 diff --git a/.github/auto_request_review.yml b/.github/auto_request_review.yml index 05162f87ea5d7..3ec436d107622 100644 --- a/.github/auto_request_review.yml +++ b/.github/auto_request_review.yml @@ -6,7 +6,6 @@ reviewers: - albanD - miladm - bdhirsh - - voznesenskym per_author: symbolic-shapes: diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 4145279c4cf16..9830a3ce9650e 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -e3efbc2d9094685dd2d4ae143853941f82f167af +ea437b31ce316ea3d66fe73768c0dcb94edb79ad diff --git a/.github/ci_commit_pins/torchbench.txt b/.github/ci_commit_pins/torchbench.txt index f2d5071375cf6..3df9dd6cf8038 100644 --- a/.github/ci_commit_pins/torchbench.txt +++ b/.github/ci_commit_pins/torchbench.txt @@ -1 +1 @@ -99944a2fb8624947f9c0e2edc898ff42a16124da +d6015d42d9a1834bc7595c4bd6852562fb80b30b diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index afca955c6c27d..c642e5d08c80d 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -d23430765b5df76cd1267f438f129f51b7d6e3e1 +d23a6e1664d20707c11781299611436e1f0c104f diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 807efc6f54204..5258ab80f18a8 100644 --- a/.github/ci_commit_pins/xla.txt +++ 
b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -e1c94dfa5a74331a376537c23bf74a2c367f24bd +e3fc03314dab5f44e3ed9ccbba6c15fbca3285cd diff --git a/.github/label_to_label.yml b/.github/label_to_label.yml new file mode 100644 index 0000000000000..e6c66a5e56cf6 --- /dev/null +++ b/.github/label_to_label.yml @@ -0,0 +1,13 @@ +# Use this to auto apply labels based on other labels. Applies to both PRs and +# issues. Currently only supports any and all +- any: + - "module: custom operators" + - "module: aotdispatch" + then: + - "module: pt2-dispatcher" +- any: + - "module: dynamo" + - "module: pt2-dispatcher" + - "module: inductor" + then: + - "oncall: pt2" diff --git a/.github/labeler.yml b/.github/labeler.yml index 4b7e5488e7881..f436ec684ffb9 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -26,10 +26,18 @@ - .github/ci_commit_pins/** - c10/core/Sym* - torch/fx/experimental/symbolic_shapes.py +- torch/fx/experimental/recording.py +- torch/fx/experimental/sym_node.py +- torch/fx/experimental/validator.py +- torch/fx/experimental/_sym_dispatch_mode.py +- torch/fx/experimental/proxy_tensor.py - test/distributed/_tensor/test_dtensor_compile.py - test/distributed/tensor/parallel/test_fsdp_2d_parallel.py - torch/distributed/_tensor/** - torch/distributed/fsdp/** +- torch/csrc/inductor/** +- test/cpp/aoti_abi_check/** +- test/cpp/aoti_inference/** "module: cpu": - aten/src/ATen/cpu/** @@ -39,6 +47,7 @@ - aten/src/ATen/native/mkldnn/** - torch/cpu/** - torch/utils/mkldnn.py +- torch/utils/_sympy/** - test/test_mkldnn.py "module: mkldnn": @@ -49,6 +58,17 @@ - third_party/mkl-dnn.BUILD - torch/csrc/jit/codegen/onednn/** - test/test_jit_llga_fuser.py +- test/test_mkldnn.py + +"ciflow/linux-aarch64": +- third_party/ideep +- caffe2/ideep/** +- caffe2/python/ideep/** +- cmake/Modules/FindMKLDNN.cmake +- third_party/mkl-dnn.BUILD +- torch/csrc/jit/codegen/onednn/** +- test/test_jit_llga_fuser.py +- test/test_mkldnn.py "module: amp (automated mixed precision)": - torch/amp/** diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml index fec99bd9e1ffe..db0ec3c51aa79 100644 --- a/.github/merge_rules.yaml +++ b/.github/merge_rules.yaml @@ -28,12 +28,13 @@ - caffe2/python/onnx/** approved_by: - BowenBao - - abock - justinchuby + - liqunfu - shubhambhokare1 - thiagocrepaldi - titaiwangms - wschin + - xadupre mandatory_checks_name: - EasyCLA - Lint @@ -236,6 +237,23 @@ - Lint - pull +- name: XPU ATen + patterns: + - aten/src/ATen/xpu/** + - c10/xpu/** + - torch/csrc/xpu/** + - torch/xpu/** + - test/xpu/** + - third_party/xpu.txt + approved_by: + - EikanWang + - jgong5 + - gujinghui + mandatory_checks_name: + - EasyCLA + - Lint + - pull + - name: Distributions patterns: - torch/distributions/** @@ -275,17 +293,20 @@ - wanchaol - fduwjj - H-Huang - - aazzolini - kwen2501 - XilunWu - wz337 - awgu - fegin - - kumpera - - yhcharles + - kurman + - LucasLLC + - sanketpurandare + - shuqiangzhang + - tianyu-l - kiukchung - d4l3k - shuqiangzhang + - weifengpy mandatory_checks_name: - EasyCLA - Lint @@ -354,12 +375,14 @@ - name: CPU inductor patterns: + - torch/_inductor/mkldnn_lowerings.py - torch/_inductor/fx_passes/mkldnn_fusion.py - torch/_inductor/fx_passes/quantization.py - torch/_inductor/codegen/cpp.py - test/inductor/test_mkldnn_pattern_matcher.py - test/inductor/test_cpu_repo.py - test/inductor/test_cpu_cpp_wrapper.py + - aten/src/ATen/cpu/** - aten/src/ATen/native/quantized/cpu/** - test/quantization/core/test_quantized_op.py - torch/ao/quantization/quantizer/x86_inductor_quantizer.py diff --git 
a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index be63e07105349..4bf7526e79141 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,5 +1,6 @@ tracking_issue: 24422 ciflow_tracking_issue: 64124 +TD_rollout_issue: 123120 ciflow_push_tags: - ciflow/binaries - ciflow/binaries_conda @@ -7,6 +8,8 @@ ciflow_push_tags: - ciflow/binaries_wheel - ciflow/inductor - ciflow/inductor-perf-compare +- ciflow/inductor-micro-benchmark +- ciflow/linux-aarch64 - ciflow/mps - ciflow/nightly - ciflow/periodic @@ -15,9 +18,12 @@ ciflow_push_tags: - ciflow/trunk - ciflow/unstable - ciflow/xpu +- ciflow/torchbench retryable_workflows: - lint - pull - trunk - linux-binary - windows-binary +labeler_config: labeler.yml +label_to_label_config: label_to_label.yml diff --git a/.github/requirements-gha-cache.txt b/.github/requirements-gha-cache.txt index 80f04544225df..1064212c5b4ed 100644 --- a/.github/requirements-gha-cache.txt +++ b/.github/requirements-gha-cache.txt @@ -1,12 +1,11 @@ # This file is to cache other dependencies not specified elsewhere in: # requirement.txt -# requirements-flake8.txt # docs/requirements.txt # docs/cpp/requirements.txt # functorch/docs/requirements.txt # .ci/docker/requirements-ci.txt boto3==1.19.12 -jinja2==3.0.1 +jinja2==3.1.4 lintrunner==0.10.7 ninja==1.10.0.post1 nvidia-ml-py==11.525.84 diff --git a/.github/requirements/conda-env-Linux-X64.txt b/.github/requirements/conda-env-Linux-X64.txt index 43afafcd26011..16bbc57dd3be2 100644 --- a/.github/requirements/conda-env-Linux-X64.txt +++ b/.github/requirements/conda-env-Linux-X64.txt @@ -4,6 +4,6 @@ mkl-include=2022.1.0 ninja=1.10.2 numpy=1.23.3 pyyaml=6.0 -requests=2.28.1 -setuptools=65.5.0 +requests=2.31.0 +setuptools=68.2.2 typing-extensions=4.3.0 diff --git a/.github/requirements/conda-env-iOS.txt b/.github/requirements/conda-env-iOS.txt index cd94a40a21ab8..205c07925a016 100644 --- a/.github/requirements/conda-env-iOS.txt +++ b/.github/requirements/conda-env-iOS.txt @@ -3,6 +3,6 @@ cmake=3.22.1 ninja=1.10.2 numpy=1.23.3 pyyaml=6.0 -requests=2.28.1 -setuptools=63.4.1 +requests=2.31.0 +setuptools=68.2.2 typing-extensions=4.3.0 diff --git a/.github/requirements/pip-requirements-iOS.txt b/.github/requirements/pip-requirements-iOS.txt index 30e67abc5c863..01290e4c7102d 100644 --- a/.github/requirements/pip-requirements-iOS.txt +++ b/.github/requirements/pip-requirements-iOS.txt @@ -1,4 +1,4 @@ # iOS simulator requirements coremltools==5.0b5 protobuf==3.20.2 -optree==0.9.1 +optree==0.11.0 diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt index 9b6986287391c..f0e4890328b35 100644 --- a/.github/requirements/pip-requirements-macOS.txt +++ b/.github/requirements/pip-requirements-macOS.txt @@ -16,7 +16,6 @@ pytest==7.3.2 pytest-xdist==3.3.1 pytest-rerunfailures==10.3 pytest-flakefinder==1.1.0 -pytest-shard==0.1.2 scipy==1.10.1 sympy==1.11.1 unittest-xml-reporting<=3.2.0,>=2.0.0 @@ -27,4 +26,7 @@ pytest-cpp==2.3.0 rockset==1.0.3 z3-solver==4.12.2.0 tensorboard==2.13.0 -optree==0.9.1 +optree==0.11.0 +# NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in +# which the stringify metadata is wrong when escaping double quote +protobuf==3.20.2 diff --git a/.github/scripts/amd/package_triton_wheel.sh b/.github/scripts/amd/package_triton_wheel.sh new file mode 100755 index 0000000000000..4295a97a340e4 --- /dev/null +++ b/.github/scripts/amd/package_triton_wheel.sh @@ -0,0 +1,99 @@ +set -ex + +# Set ROCM_HOME isn't available, use 
ROCM_PATH if set or /opt/rocm +ROCM_HOME="${ROCM_HOME:-${ROCM_PATH:-/opt/rocm}}" + +# Find rocm_version.h header file for ROCm version extract +rocm_version_h="${ROCM_HOME}/include/rocm-core/rocm_version.h" +if [ ! -f "$rocm_version_h" ]; then + rocm_version_h="${ROCM_HOME}/include/rocm_version.h" +fi + +# Error out if rocm_version.h not found +if [ ! -f "$rocm_version_h" ]; then + echo "Error: rocm_version.h not found in expected locations." >&2 + exit 1 +fi + +# Extract major, minor and patch ROCm version numbers +MAJOR_VERSION=$(grep 'ROCM_VERSION_MAJOR' "$rocm_version_h" | awk '{print $3}') +MINOR_VERSION=$(grep 'ROCM_VERSION_MINOR' "$rocm_version_h" | awk '{print $3}') +PATCH_VERSION=$(grep 'ROCM_VERSION_PATCH' "$rocm_version_h" | awk '{print $3}') +ROCM_INT=$(($MAJOR_VERSION * 10000 + $MINOR_VERSION * 100 + $PATCH_VERSION)) +echo "ROCm version: $ROCM_INT" + +# Check TRITON_ROCM_DIR is set +if [[ -z "${TRITON_ROCM_DIR}" ]]; then + export TRITON_ROCM_DIR=third_party/amd/backend +fi + +# Remove packaged libs and headers +rm -rf $TRITON_ROCM_DIR/include/* + +LIBTINFO_PATH="/usr/lib64/libtinfo.so.5" +LIBNUMA_PATH="/usr/lib64/libnuma.so.1" +LIBELF_PATH="/usr/lib64/libelf.so.1" + +OS_SO_PATHS=( + $LIBELF_PATH + $LIBNUMA_PATH + $LIBTINFO_PATH +) + +for lib in "${OS_SO_PATHS[@]}" +do + cp $lib $TRITON_ROCM_DIR/lib/ +done + +# Required ROCm libraries +if [[ "${MAJOR_VERSION}" == "6" ]]; then + libamdhip="libamdhip64.so.6" +else + libamdhip="libamdhip64.so.5" +fi + +# Required ROCm libraries - ROCm 6.0 +ROCM_SO=( + "${libamdhip}" + "libhsa-runtime64.so.1" + "libamd_comgr.so.2" + "libdrm.so.2" + "libdrm_amdgpu.so.1" +) + +if [[ $ROCM_INT -ge 60100 ]]; then + ROCM_SO+=("librocprofiler-register.so.0") +fi + +for lib in "${ROCM_SO[@]}" +do + file_path=($(find $ROCM_HOME/lib/ -name "$lib")) # First search in lib + if [[ -z $file_path ]]; then + if [ -d "$ROCM_HOME/lib64/" ]; then + file_path=($(find $ROCM_HOME/lib64/ -name "$lib")) # Then search in lib64 + fi + fi + if [[ -z $file_path ]]; then + file_path=($(find $ROCM_HOME/ -name "$lib")) # Then search in ROCM_HOME + fi + if [[ -z $file_path ]]; then + file_path=($(find /opt/ -name "$lib")) # Then search in /opt + fi + if [[ -z $file_path ]]; then + echo "Error: Library file $lib is not found." 
>&2 + exit 1 + fi + + cp $file_path $TRITON_ROCM_DIR/lib + # When running locally, and not building a wheel, we need to satisfy shared objects requests that don't look for versions + LINKNAME=$(echo $lib | sed -e 's/\.so.*/.so/g') + ln -sf $lib $TRITON_ROCM_DIR/lib/$LINKNAME + +done + +# Copy Include Files +cp -r $ROCM_HOME/include/hip $TRITON_ROCM_DIR/include + +# Copy linker +mkdir -p $TRITON_ROCM_DIR/llvm/bin +cp $ROCM_HOME/llvm/bin/ld.lld $TRITON_ROCM_DIR/llvm/bin/ diff --git a/.github/scripts/amd/patch_triton_wheel.sh b/.github/scripts/amd/patch_triton_wheel.sh new file mode 100755 index 0000000000000..667fcb645587c --- /dev/null +++ b/.github/scripts/amd/patch_triton_wheel.sh @@ -0,0 +1,103 @@ +#!/bin/bash +set -x + +if [ -z "$1" ]; then + echo "Need wheel location argument" && exit 1 +fi + +WHEELHOUSE_DIR=$1 +PATCHELF_BIN=patchelf +ROCM_LIB=backends/amd/lib +ROCM_LD=backends/amd/llvm/bin +PREFIX=triton +fname_without_so_number() { + LINKNAME=$(echo $1 | sed -e 's/\.so.*/.so/g') + echo "$LINKNAME" +} + +replace_needed_sofiles() { + find $1 -name '*.so*' -o -name 'ld.lld' | while read sofile; do + origname=$2 + patchedname=$3 + if [[ "$origname" != "$patchedname" ]]; then + set +e + origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*") + ERRCODE=$? + set -e + if [ "$ERRCODE" -eq "0" ]; then + echo "patching $sofile entry $origname to $patchedname" + $PATCHELF_BIN --replace-needed $origname $patchedname $sofile + fi + fi + done +} + +mkdir -p "/tmp_dir" +pushd /tmp_dir +for pkg in /$WHEELHOUSE_DIR/*triton*.whl; do + echo "Modifying $pkg" + rm -rf tmp + mkdir -p tmp + cd tmp + cp $pkg . + unzip -q $(basename $pkg) + rm -f $(basename $pkg) + $PATCHELF_BIN --set-rpath ${LD_SO_RPATH:-'$ORIGIN:$ORIGIN/../../lib'} $PREFIX/$ROCM_LD/ld.lld + $PATCHELF_BIN --print-rpath $PREFIX/$ROCM_LD/ld.lld + # Modify libtriton.so as it sits in _C directory apart from its dependencies + find $PREFIX/_C -type f -name "*.so*" | while read sofile; do + echo "Setting rpath of $sofile" + $PATCHELF_BIN --set-rpath ${C_SO_RPATH:-'$ORIGIN:$ORIGIN/'../$ROCM_LIB} ${FORCE_RPATH:-} $sofile + $PATCHELF_BIN --print-rpath $sofile + done + + # All included dependencies are included in a single lib directory + deps=() + deps_soname=() + while read sofile; do + echo "Setting rpath of $sofile to ${LIB_SO_RPATH:-'$ORIGIN'}" + $PATCHELF_BIN --set-rpath ${LIB_SO_RPATH:-'$ORIGIN'} ${FORCE_RPATH:-} $sofile + $PATCHELF_BIN --print-rpath $sofile + deps+=("$sofile") + deps_soname+=("$(basename $sofile)") + done < <(find $PREFIX/$ROCM_LIB -type f -name "*.so*") + + patched=() + for filepath in "${deps[@]}"; do + filename=$(basename $filepath) + destpath=$PREFIX/$ROCM_LIB/$filename + if [[ "$filepath" != "$destpath" ]]; then + cp $filepath $destpath + fi + patchedpath=$(fname_without_so_number $destpath) + patchedname=$(basename $patchedpath) + if [[ "$destpath" != "$patchedpath" ]]; then + mv $destpath $patchedpath + fi + patched+=("$patchedname") + echo "Copied $filepath to $patchedpath" + done + + # Go through all required shared objects and see if any of our other objects are dependants. 
If so, replace so.ver wth so + for ((i=0;i<${#deps[@]};++i)); do + echo "replacing "${deps_soname[i]} ${patched[i]} + replace_needed_sofiles $PREFIX/$ROCM_LIB ${deps_soname[i]} ${patched[i]} + replace_needed_sofiles $PREFIX/_C ${deps_soname[i]} ${patched[i]} + replace_needed_sofiles $PREFIX/$ROCM_LD ${deps_soname[i]} ${patched[i]} + done + + # Re-bundle whl with so adjustments + zip -rqy $(basename $pkg) * + + if [[ -z "${MANYLINUX_VERSION}" ]]; then + newpkg=$pkg + else + newpkg=$(echo $pkg | sed -e "s/\linux_x86_64/${MANYLINUX_VERSION}/g") + fi + + # Remove original whl + rm -f $pkg + + # Move rebuilt whl to original location with new name. + mv $(basename $pkg) $newpkg +done diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index 693d6892ff592..f422f6766cc40 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -10,9 +10,6 @@ SCRIPT_DIR = Path(__file__).parent REPO_DIR = SCRIPT_DIR.parent.parent -# TODO: Remove me once Triton version is again in sync for vanilla and ROCm -ROCM_TRITION_VERSION = "2.1.0" - def read_triton_pin(rocm_hash: bool = False) -> str: triton_file = "triton.txt" if not rocm_hash else "triton-rocm.txt" @@ -32,27 +29,6 @@ def check_and_replace(inp: str, src: str, dst: str) -> str: return inp.replace(src, dst) -def patch_setup_py( - path: Path, - *, - version: str, - name: str = "triton", - expected_version: Optional[str] = None, -) -> None: - with open(path) as f: - orig = f.read() - # Replace name - orig = check_and_replace(orig, 'name="triton",', f'name="{name}",') - # Replace version - if not expected_version: - expected_version = read_triton_version() - orig = check_and_replace( - orig, f'version="{expected_version}",', f'version="{version}",' - ) - with open(path, "w") as f: - f.write(orig) - - def patch_init_py( path: Path, *, version: str, expected_version: Optional[str] = None ) -> None: @@ -92,14 +68,20 @@ def build_triton( with TemporaryDirectory() as tmpdir: triton_basedir = Path(tmpdir) / "triton" triton_pythondir = triton_basedir / "python" + triton_repo = "https://github.com/openai/triton" if build_rocm: - triton_repo = "https://github.com/ROCmSoftwarePlatform/triton" triton_pkg_name = "pytorch-triton-rocm" else: - triton_repo = "https://github.com/openai/triton" triton_pkg_name = "pytorch-triton" check_call(["git", "clone", triton_repo], cwd=tmpdir) - check_call(["git", "checkout", commit_hash], cwd=triton_basedir) + if release: + ver, rev, patch = version.split(".") + check_call( + ["git", "checkout", f"release/{ver}.{rev}.x"], cwd=triton_basedir + ) + else: + check_call(["git", "checkout", commit_hash], cwd=triton_basedir) + if build_conda: with open(triton_basedir / "meta.yaml", "w") as meta: print( @@ -109,7 +91,7 @@ def build_triton( print("source:\n path: .\n", file=meta) print( "build:\n string: py{{py}}\n number: 1\n script: cd python; " - "python setup.py install --single-version-externally-managed --record=record.txt\n", + "python setup.py install --record=record.txt\n", " script_env:\n - MAX_JOBS\n", file=meta, ) @@ -155,18 +137,15 @@ def build_triton( patch_init_py( triton_pythondir / "triton" / "__init__.py", version=f"{version}", - expected_version=ROCM_TRITION_VERSION if build_rocm else None, + expected_version=None, ) if build_rocm: - # TODO: Remove me when ROCM triton is updated - patch_setup_py( - triton_pythondir / "setup.py", - name=triton_pkg_name, - version=f"{version}", - expected_version=ROCM_TRITION_VERSION, + check_call( + 
[f"{SCRIPT_DIR}/amd/package_triton_wheel.sh"], + cwd=triton_basedir, + shell=True, ) - check_call("scripts/amd/setup_rocm_libs.sh", cwd=triton_basedir, shell=True) print("ROCm libraries setup for triton installation...") check_call( @@ -177,7 +156,10 @@ def build_triton( shutil.copy(whl_path, Path.cwd()) if build_rocm: - check_call("scripts/amd/fix_so.sh", cwd=triton_basedir, shell=True) + check_call( + [f"{SCRIPT_DIR}/amd/patch_triton_wheel.sh", Path.cwd()], + cwd=triton_basedir, + ) return Path.cwd() / whl_path.name diff --git a/.github/scripts/cherry_pick.py b/.github/scripts/cherry_pick.py new file mode 100755 index 0000000000000..4c892de21da8a --- /dev/null +++ b/.github/scripts/cherry_pick.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python3 + +import json +import os +import re +from typing import Any, Optional + +from urllib.error import HTTPError + +from github_utils import gh_fetch_url, gh_post_pr_comment + +from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo +from trymerge import get_pr_commit_sha, GitHubPR + + +# This is only a suggestion for now, not a strict requirement +REQUIRES_ISSUE = { + "regression", + "critical", + "fixnewfeature", +} + + +def parse_args() -> Any: + from argparse import ArgumentParser + + parser = ArgumentParser("cherry pick a landed PR onto a release branch") + parser.add_argument( + "--onto-branch", type=str, required=True, help="the target release branch" + ) + parser.add_argument( + "--github-actor", type=str, required=True, help="all the world's a stage" + ) + parser.add_argument( + "--classification", + choices=["regression", "critical", "fixnewfeature", "docs", "release"], + required=True, + help="the cherry pick category", + ) + parser.add_argument("pr_num", type=int) + parser.add_argument( + "--fixes", + type=str, + default="", + help="the GitHub issue that the cherry pick fixes", + ) + parser.add_argument("--dry-run", action="store_true") + + return parser.parse_args() + + +def get_merge_commit_sha(repo: GitRepo, pr: GitHubPR) -> Optional[str]: + """ + Return the merge commit SHA iff the PR has been merged. For simplicity, we + will only cherry pick PRs that have been merged into main + """ + commit_sha = get_pr_commit_sha(repo, pr) + return commit_sha if pr.is_closed() else None + + +def cherry_pick( + github_actor: str, + repo: GitRepo, + pr: GitHubPR, + commit_sha: str, + onto_branch: str, + classification: str, + fixes: str, + dry_run: bool = False, +) -> None: + """ + Create a local branch to cherry pick the commit and submit it as a pull request + """ + current_branch = repo.current_branch() + cherry_pick_branch = create_cherry_pick_branch( + github_actor, repo, pr, commit_sha, onto_branch + ) + + try: + if not dry_run: + org, project = repo.gh_owner_and_name() + cherry_pick_pr = submit_pr(repo, pr, cherry_pick_branch, onto_branch) + + msg = f"The cherry pick PR is at {cherry_pick_pr}" + if fixes: + msg += f" and it is linked with issue {fixes}" + elif classification in REQUIRES_ISSUE: + msg += f" and it is recommended to link a {classification} cherry pick PR with an issue" + + post_comment(org, project, pr.pr_num, msg) + + finally: + if current_branch: + repo.checkout(branch=current_branch) + + +def create_cherry_pick_branch( + github_actor: str, repo: GitRepo, pr: GitHubPR, commit_sha: str, onto_branch: str +) -> str: + """ + Create a local branch and cherry pick the commit. Return the name of the local + cherry picking branch. 
+ """ + repo.checkout(branch=onto_branch) + repo._run_git("submodule", "update", "--init", "--recursive") + + # Remove all special characters if we want to include the actor in the branch name + github_actor = re.sub("[^0-9a-zA-Z]+", "_", github_actor) + + cherry_pick_branch = f"cherry-pick-{pr.pr_num}-by-{github_actor}" + repo.create_branch_and_checkout(branch=cherry_pick_branch) + + # We might want to support ghstack later + repo._run_git("cherry-pick", "-x", "-X", "theirs", commit_sha) + repo.push(branch=cherry_pick_branch, dry_run=False) + + return cherry_pick_branch + + +def submit_pr( + repo: GitRepo, + pr: GitHubPR, + cherry_pick_branch: str, + onto_branch: str, +) -> str: + """ + Submit the cherry pick PR and return the link to the PR + """ + org, project = repo.gh_owner_and_name() + + default_msg = f"Cherry pick #{pr.pr_num} onto {onto_branch} branch" + title = pr.info.get("title", default_msg) + body = pr.info.get("body", default_msg) + + try: + response = gh_fetch_url( + f"https://api.github.com/repos/{org}/{project}/pulls", + method="POST", + data={ + "title": title, + "body": body, + "head": cherry_pick_branch, + "base": onto_branch, + }, + headers={"Accept": "application/vnd.github.v3+json"}, + reader=json.load, + ) + + cherry_pick_pr = response.get("html_url", "") + if not cherry_pick_pr: + raise RuntimeError( + f"Fail to find the cherry pick PR: {json.dumps(response)}" + ) + + return str(cherry_pick_pr) + + except HTTPError as error: + msg = f"Fail to submit the cherry pick PR: {error}" + raise RuntimeError(msg) from error + + +def post_comment(org: str, project: str, pr_num: int, msg: str) -> None: + """ + Post a comment on the PR itself to point to the cherry picking PR when success + or print the error when failure + """ + internal_debugging = "" + + run_url = os.getenv("GH_RUN_URL") + # Post a comment to tell folks that the PR is being cherry picked + if run_url is not None: + internal_debugging = "\n".join( + line + for line in ( + "
<details><summary>Details for Dev Infra team</summary>", + f'Raised by <a href="{run_url}">workflow job</a>\n', + "</details>
", + ) + if line + ) + + comment = "\n".join( + (f"### Cherry picking #{pr_num}", f"{msg}", "", f"{internal_debugging}") + ) + gh_post_pr_comment(org, project, pr_num, comment) + + +def main() -> None: + args = parse_args() + pr_num = args.pr_num + + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + org, project = repo.gh_owner_and_name() + + pr = GitHubPR(org, project, pr_num) + + try: + commit_sha = get_merge_commit_sha(repo, pr) + if not commit_sha: + raise RuntimeError( + f"Refuse to cherry pick #{pr_num} because it hasn't been merged yet" + ) + + cherry_pick( + args.github_actor, + repo, + pr, + commit_sha, + args.onto_branch, + args.classification, + args.fixes, + args.dry_run, + ) + + except RuntimeError as error: + if not args.dry_run: + post_comment(org, project, pr_num, str(error)) + else: + raise error + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/comment_on_pr.py b/.github/scripts/comment_on_pr.py index 88edcce27cd5d..57fce4bf97399 100644 --- a/.github/scripts/comment_on_pr.py +++ b/.github/scripts/comment_on_pr.py @@ -23,8 +23,10 @@ def main() -> None: job_link = f"[job]({run_url})" if run_url is not None else "job" msg = ( - f"The {args.action} {job_link} was canceled. If you believe this is a mistake," - + f" then you can re trigger it through [pytorch-bot]({BOT_COMMANDS_WIKI})." + f"The {args.action} {job_link} was canceled or timed out. This most often happen if two merge requests were issued" + + " for the same PR, or if merge job was waiting for more than 6 hours for tests to finish." + + " In later case, please do not hesitate to reissue the merge command\n" + + f" For more information see [pytorch-bot wiki]({BOT_COMMANDS_WIKI})." ) gh_post_pr_comment(org, project, args.pr_num, msg) diff --git a/.github/scripts/delete_old_branches.py b/.github/scripts/delete_old_branches.py new file mode 100644 index 0000000000000..21b86fefa1a89 --- /dev/null +++ b/.github/scripts/delete_old_branches.py @@ -0,0 +1,274 @@ +# Delete old branches +import os +import re +from datetime import datetime +from pathlib import Path +from typing import Any, Callable, Dict, List, Set + +from github_utils import gh_fetch_json_dict, gh_graphql +from gitutils import GitRepo + +SEC_IN_DAY = 24 * 60 * 60 +CLOSED_PR_RETENTION = 30 * SEC_IN_DAY +NO_PR_RETENTION = 1.5 * 365 * SEC_IN_DAY +PR_WINDOW = 90 * SEC_IN_DAY # Set to None to look at all PRs (may take a lot of tokens) +REPO_OWNER = "pytorch" +REPO_NAME = "pytorch" +ESTIMATED_TOKENS = [0] + +TOKEN = os.environ["GITHUB_TOKEN"] +if not TOKEN: + raise Exception("GITHUB_TOKEN is not set") # noqa: TRY002 + +REPO_ROOT = Path(__file__).parent.parent.parent + +# Query for all PRs instead of just closed/merged because it's faster +GRAPHQL_ALL_PRS_BY_UPDATED_AT = """ +query ($owner: String!, $repo: String!, $cursor: String) { + repository(owner: $owner, name: $repo) { + pullRequests( + first: 100 + after: $cursor + orderBy: {field: UPDATED_AT, direction: DESC} + ) { + totalCount + pageInfo { + hasNextPage + endCursor + } + nodes { + headRefName + number + updatedAt + state + } + } + } +} +""" + +GRAPHQL_OPEN_PRS = """ +query ($owner: String!, $repo: String!, $cursor: String) { + repository(owner: $owner, name: $repo) { + pullRequests( + first: 100 + after: $cursor + states: [OPEN] + ) { + totalCount + pageInfo { + hasNextPage + endCursor + } + nodes { + headRefName + number + updatedAt + state + } + } + } +} +""" + +GRAPHQL_NO_DELETE_BRANCH_LABEL = """ +query ($owner: String!, $repo: String!, $cursor: String) { + 
repository(owner: $owner, name: $repo) { + label(name: "no-delete-branch") { + pullRequests(first: 100, after: $cursor) { + totalCount + pageInfo { + hasNextPage + endCursor + } + nodes { + headRefName + number + updatedAt + state + } + } + } + } +} +""" + + +def is_protected(branch: str) -> bool: + try: + ESTIMATED_TOKENS[0] += 1 + res = gh_fetch_json_dict( + f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/branches/{branch}" + ) + return bool(res["protected"]) + except Exception as e: + print(f"[{branch}] Failed to fetch branch protections: {e}") + return True + + +def convert_gh_timestamp(date: str) -> float: + return datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ").timestamp() + + +def get_branches(repo: GitRepo) -> Dict[str, Any]: + # Query locally for branches, group by branch base name (e.g. gh/blah/base -> gh/blah), and get the most recent branch + git_response = repo._run_git( + "for-each-ref", + "--sort=creatordate", + "--format=%(refname) %(committerdate:iso-strict)", + "refs/remotes/origin", + ) + branches_by_base_name: Dict[str, Any] = {} + for line in git_response.splitlines(): + branch, date = line.split(" ") + re_branch = re.match(r"refs/remotes/origin/(.*)", branch) + assert re_branch + branch = branch_base_name = re_branch.group(1) + if x := re.match(r"(gh\/.+)\/(head|base|orig)", branch): + branch_base_name = x.group(1) + date = datetime.fromisoformat(date).timestamp() + if branch_base_name not in branches_by_base_name: + branches_by_base_name[branch_base_name] = [date, [branch]] + else: + branches_by_base_name[branch_base_name][1].append(branch) + if date > branches_by_base_name[branch_base_name][0]: + branches_by_base_name[branch_base_name][0] = date + return branches_by_base_name + + +def paginate_graphql( + query: str, + kwargs: Dict[str, Any], + termination_func: Callable[[List[Dict[str, Any]]], bool], + get_data: Callable[[Dict[str, Any]], List[Dict[str, Any]]], + get_page_info: Callable[[Dict[str, Any]], Dict[str, Any]], +) -> List[Any]: + hasNextPage = True + endCursor = None + data: List[Dict[str, Any]] = [] + while hasNextPage: + ESTIMATED_TOKENS[0] += 1 + res = gh_graphql(query, cursor=endCursor, **kwargs) + data.extend(get_data(res)) + hasNextPage = get_page_info(res)["hasNextPage"] + endCursor = get_page_info(res)["endCursor"] + if termination_func(data): + break + return data + + +def get_recent_prs() -> Dict[str, Any]: + now = datetime.now().timestamp() + + # Grab all PRs updated in last CLOSED_PR_RETENTION days + pr_infos: List[Dict[str, Any]] = paginate_graphql( + GRAPHQL_ALL_PRS_BY_UPDATED_AT, + {"owner": "pytorch", "repo": "pytorch"}, + lambda data: ( + PR_WINDOW is not None + and (now - convert_gh_timestamp(data[-1]["updatedAt"]) > PR_WINDOW) + ), + lambda res: res["data"]["repository"]["pullRequests"]["nodes"], + lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"], + ) + + # Get the most recent PR for each branch base (group gh together) + prs_by_branch_base = {} + for pr in pr_infos: + pr["updatedAt"] = convert_gh_timestamp(pr["updatedAt"]) + branch_base_name = pr["headRefName"] + if x := re.match(r"(gh\/.+)\/(head|base|orig)", branch_base_name): + branch_base_name = x.group(1) + if branch_base_name not in prs_by_branch_base: + prs_by_branch_base[branch_base_name] = pr + else: + if pr["updatedAt"] > prs_by_branch_base[branch_base_name]["updatedAt"]: + prs_by_branch_base[branch_base_name] = pr + return prs_by_branch_base + + +def get_branches_with_magic_label_or_open_pr() -> Set[str]: + pr_infos: List[Dict[str, Any]] = 
paginate_graphql( + GRAPHQL_NO_DELETE_BRANCH_LABEL, + {"owner": "pytorch", "repo": "pytorch"}, + lambda data: False, + lambda res: res["data"]["repository"]["label"]["pullRequests"]["nodes"], + lambda res: res["data"]["repository"]["label"]["pullRequests"]["pageInfo"], + ) + + pr_infos.extend( + paginate_graphql( + GRAPHQL_OPEN_PRS, + {"owner": "pytorch", "repo": "pytorch"}, + lambda data: False, + lambda res: res["data"]["repository"]["pullRequests"]["nodes"], + lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"], + ) + ) + + # Get the most recent PR for each branch base (group gh together) + branch_bases = set() + for pr in pr_infos: + branch_base_name = pr["headRefName"] + if x := re.match(r"(gh\/.+)\/(head|base|orig)", branch_base_name): + branch_base_name = x.group(1) + branch_bases.add(branch_base_name) + return branch_bases + + +def delete_branch(repo: GitRepo, branch: str) -> None: + repo._run_git("push", "origin", "-d", branch) + + +def delete_branches() -> None: + now = datetime.now().timestamp() + git_repo = GitRepo(str(REPO_ROOT), "origin", debug=True) + branches = get_branches(git_repo) + prs_by_branch = get_recent_prs() + keep_branches = get_branches_with_magic_label_or_open_pr() + + delete = [] + # Do not delete if: + # * associated PR is open, closed but updated recently, or contains the magic string + # * no associated PR and branch was updated in last 1.5 years + # * is protected + # Setting different values of PR_WINDOW will change how branches with closed + # PRs are treated depending on how old the branch is. The default value of + # 90 will allow branches with closed PRs to be deleted if the PR hasn't been + # updated in 90 days and the branch hasn't been updated in 1.5 years + for base_branch, (date, sub_branches) in branches.items(): + print(f"[{base_branch}] Updated {(now - date) / SEC_IN_DAY} days ago") + if base_branch in keep_branches: + print(f"[{base_branch}] Has magic label or open PR, skipping") + continue + pr = prs_by_branch.get(base_branch) + if pr: + print( + f"[{base_branch}] Has PR {pr['number']}: {pr['state']}, updated {(now - pr['updatedAt']) / SEC_IN_DAY} days ago" + ) + if ( + now - pr["updatedAt"] < CLOSED_PR_RETENTION + or (now - date) < CLOSED_PR_RETENTION + ): + continue + elif now - date < NO_PR_RETENTION: + continue + print(f"[{base_branch}] Checking for branch protections") + if any(is_protected(sub_branch) for sub_branch in sub_branches): + print(f"[{base_branch}] Is protected") + continue + for sub_branch in sub_branches: + print(f"[{base_branch}] Deleting {sub_branch}") + delete.append(sub_branch) + if ESTIMATED_TOKENS[0] > 400: + print("Estimated tokens exceeded, exiting") + break + + print(f"To delete ({len(delete)}):") + for branch in delete: + print(f"About to delete branch {branch}") + delete_branch(git_repo, branch) + + +if __name__ == "__main__": + delete_branches() diff --git a/.github/scripts/drci_mocks.json.gz b/.github/scripts/drci_mocks.json.gz index 5e272a1493a4f..a4c1db752cb09 100644 Binary files a/.github/scripts/drci_mocks.json.gz and b/.github/scripts/drci_mocks.json.gz differ diff --git a/.github/scripts/fetch_latest_green_commit.py b/.github/scripts/fetch_latest_green_commit.py deleted file mode 100644 index 1f0cd91233b98..0000000000000 --- a/.github/scripts/fetch_latest_green_commit.py +++ /dev/null @@ -1,139 +0,0 @@ -import os -import re -import sys -from typing import Any, cast, Dict, List, NamedTuple, Tuple - -import rockset # type: ignore[import] -from gitutils import _check_output - - -def 
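The keep/delete decision above is spread across the loop in `delete_branches()`; as a reading aid, here is a minimal, hypothetical distillation of those rules as a standalone helper (`should_delete` does not exist in the script, the thresholds simply restate its module constants, and the `PR_WINDOW` lookback that limits which closed PRs are fetched at all is ignored):

```
from typing import Optional

SEC_IN_DAY = 24 * 60 * 60
CLOSED_PR_RETENTION = 30 * SEC_IN_DAY     # same thresholds as the module constants
NO_PR_RETENTION = 1.5 * 365 * SEC_IN_DAY


def should_delete(
    branch_age_sec: float,
    pr_age_sec: Optional[float],        # age of the most recent associated PR, if any
    has_open_pr_or_magic_label: bool,   # open PR, or a PR carrying "no-delete-branch"
    is_protected: bool,
) -> bool:
    """Mirror of the per-branch decision made inside delete_branches()."""
    if has_open_pr_or_magic_label or is_protected:
        return False
    if pr_age_sec is not None:
        # Closed/merged PR: keep while either the PR or the branch is < 30 days old
        return (
            pr_age_sec >= CLOSED_PR_RETENTION
            and branch_age_sec >= CLOSED_PR_RETENTION
        )
    # No associated PR: keep the branch for roughly 1.5 years after its last update
    return branch_age_sec >= NO_PR_RETENTION
```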
eprint(msg: str) -> None: - print(msg, file=sys.stderr) - - -class WorkflowCheck(NamedTuple): - workflowName: str - name: str - jobName: str - conclusion: str - - -def get_latest_commits() -> List[str]: - latest_viable_commit = _check_output( - [ - "git", - "log", - "-n", - "1", - "--pretty=format:%H", - "origin/viable/strict", - ], - encoding="ascii", - ) - commits = _check_output( - [ - "git", - "rev-list", - f"{latest_viable_commit}^..HEAD", - "--remotes=*origin/main", - ], - encoding="ascii", - ).splitlines() - - return commits - - -def query_commits(commits: List[str]) -> List[Dict[str, Any]]: - rs = rockset.RocksetClient( - host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"] - ) - params = [{"name": "shas", "type": "string", "value": ",".join(commits)}] - res = rs.QueryLambdas.execute_query_lambda( - # https://console.rockset.com/lambdas/details/commons.commit_jobs_batch_query - query_lambda="commit_jobs_batch_query", - version="19c74e10819104f9", - workspace="commons", - parameters=params, - ) - - return cast(List[Dict[str, Any]], res.results) - - -def print_commit_status(commit: str, results: Dict[str, Any]) -> None: - print(commit) - for check in results["results"]: - if check["sha"] == commit: - print(f"\t{check['conclusion']:>10}: {check['name']}") - - -def get_commit_results( - commit: str, results: List[Dict[str, Any]] -) -> List[Dict[str, Any]]: - workflow_checks = [] - for check in results: - if check["sha"] == commit: - workflow_checks.append( - WorkflowCheck( - workflowName=check["workflowName"], - name=check["name"], - jobName=check["jobName"], - conclusion=check["conclusion"], - )._asdict() - ) - return workflow_checks - - -def isGreen(commit: str, results: List[Dict[str, Any]]) -> Tuple[bool, str]: - workflow_checks = get_commit_results(commit, results) - - regex = { - "pull": False, - "trunk": False, - "lint": False, - "linux-binary": False, - } - - for check in workflow_checks: - jobName = check["jobName"] - # Ignore result from unstable job, be it success or failure - if "unstable" in jobName: - continue - - workflowName = check["workflowName"] - conclusion = check["conclusion"] - for required_check in regex: - if re.match(required_check, workflowName, flags=re.IGNORECASE): - if conclusion not in ["success", "skipped"]: - return (False, workflowName + " checks were not successful") - else: - regex[required_check] = True - - missing_workflows = [x for x in regex.keys() if not regex[x]] - if len(missing_workflows) > 0: - return (False, "missing required workflows: " + ", ".join(missing_workflows)) - - return (True, "") - - -def get_latest_green_commit(commits: List[str], results: List[Dict[str, Any]]) -> Any: - for commit in commits: - eprint(f"Checking {commit}") - is_green, msg = isGreen(commit, results) - if is_green: - eprint("GREEN") - return commit - else: - eprint("RED: " + msg) - return None - - -def main() -> None: - commits = get_latest_commits() - results = query_commits(commits) - - latest_viable_commit = get_latest_green_commit(commits, results) - print(latest_viable_commit) - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index c25b0f6fe84d0..c2e45bac81100 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import json +import logging import os import re import subprocess @@ -8,6 +9,7 @@ import warnings from enum import Enum from functools import lru_cache +from logging 
import info from typing import Any, Callable, Dict, List, Optional, Set from urllib.request import Request, urlopen @@ -17,33 +19,7 @@ PREFIX = "test-config/" -# Same as shard names -VALID_TEST_CONFIG_LABELS = { - f"{PREFIX}{label}" - for label in { - "backwards_compat", - "crossref", - "default", - "deploy", - "distributed", - "docs_tests", - "dynamo", - "force_on_cpu", - "functorch", - "inductor", - "inductor_distributed", - "inductor_huggingface", - "inductor_timm", - "inductor_torchbench", - "jit_legacy", - "multigpu", - "nogpu_AVX512", - "nogpu_NO_AVX2", - "slow", - "tsan", - "xla", - } -} +logging.basicConfig(level=logging.INFO) def is_cuda_or_rocm_job(job_name: Optional[str]) -> bool: @@ -90,6 +66,12 @@ def parse_args() -> Any: parser.add_argument( "--test-matrix", type=str, required=True, help="the original test matrix" ) + parser.add_argument( + "--selected-test-configs", + type=str, + default="", + help="a comma-separated list of test configurations from the test matrix to keep", + ) parser.add_argument( "--workflow", type=str, help="the name of the current workflow, i.e. pull" ) @@ -155,19 +137,25 @@ def get_labels(pr_number: int) -> Set[str]: } +def filter_labels(labels: Set[str], label_regex: Any) -> Set[str]: + """ + Return the list of matching labels + """ + return {l for l in labels if re.match(label_regex, l)} + + def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, List[Any]]: """ Select the list of test config to run from the test matrix. The logic works as follows: - If the PR has one or more labels as specified in the VALID_TEST_CONFIG_LABELS set, only - these test configs will be selected. This also works with ciflow labels, for example, - if a PR has both ciflow/trunk and test-config/functorch, only trunk functorch builds - and tests will be run + If the PR has one or more test-config labels as specified, only these test configs + will be selected. This also works with ciflow labels, for example, if a PR has both + ciflow/trunk and test-config/functorch, only trunk functorch builds and tests will + be run. If the PR has none of the test-config label, all tests are run as usual. 
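To make the rule above concrete, here is a small standalone sketch of the selection logic, not the CI code itself (`select_configs` and the sample matrix are made up for illustration): any `test-config/<name>` label on the PR selects the matching entry of the test matrix, and with no such label the matrix passes through unchanged.

```
import re
from typing import Any, Dict, List, Set

PREFIX = "test-config/"


def select_configs(
    test_matrix: Dict[str, List[Any]], labels: Set[str]
) -> Dict[str, List[Any]]:
    # Keep an entry when the PR carries the matching test-config/<config> label
    selected = [
        entry
        for entry in test_matrix.get("include", [])
        if f"{PREFIX}{entry.get('config', '').strip()}" in labels
    ]
    # With no test-config/* label at all, run everything as usual
    if not any(re.match(f"{PREFIX}.+", label) for label in labels):
        return test_matrix
    return {"include": selected}


matrix = {"include": [{"config": "default"}, {"config": "functorch"}]}
print(select_configs(matrix, {"ciflow/trunk", "test-config/functorch"}))
# {'include': [{'config': 'functorch'}]}
```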
""" - filtered_test_matrix: Dict[str, List[Any]] = {"include": []} for entry in test_matrix.get("include", []): @@ -177,23 +165,46 @@ def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, Lis label = f"{PREFIX}{config_name.strip()}" if label in labels: - print( - f"Select {config_name} because label {label} is presented in the pull request by the time the test starts" - ) + msg = f"Select {config_name} because label {label} is present in the pull request by the time the test starts" + info(msg) filtered_test_matrix["include"].append(entry) - valid_test_config_labels = labels.intersection(VALID_TEST_CONFIG_LABELS) - - if not filtered_test_matrix["include"] and not valid_test_config_labels: - # Found no valid label and the filtered test matrix is empty, return the same + test_config_labels = filter_labels(labels, re.compile(f"{PREFIX}.+")) + if not filtered_test_matrix["include"] and not test_config_labels: + info("Found no test-config label on the PR, so all test configs are included") + # Found no test-config label and the filtered test matrix is empty, return the same # test matrix as before so that all tests can be run normally return test_matrix else: + msg = f"Found {test_config_labels} on the PR so only these test configs are run" + info(msg) # When the filter test matrix contain matches or if a valid test config label # is found in the PR, return the filtered test matrix return filtered_test_matrix +def filter_selected_test_configs( + test_matrix: Dict[str, List[Any]], selected_test_configs: Set[str] +) -> Dict[str, List[Any]]: + """ + Keep only the selected configs if the list if not empty. Otherwise, keep all test configs. + This filter is used when the workflow is dispatched manually. + """ + if not selected_test_configs: + return test_matrix + + filtered_test_matrix: Dict[str, List[Any]] = {"include": []} + for entry in test_matrix.get("include", []): + config_name = entry.get("config", "") + if not config_name: + continue + + if config_name in selected_test_configs: + filtered_test_matrix["include"].append(entry) + + return filtered_test_matrix + + def set_periodic_modes( test_matrix: Dict[str, List[Any]], job_name: Optional[str] ) -> Dict[str, List[Any]]: @@ -374,30 +385,33 @@ def process_jobs( # - If the target record has the job (config) name, only that test config # will be skipped or marked as unstable if not target_job_cfg: - print( + msg = ( f"Issue {target_url} created by {author} has {issue_type.value} " + f"all CI jobs for {workflow} / {job_name}" ) + info(msg) return _filter_jobs( test_matrix=test_matrix, issue_type=issue_type, ) if target_job_cfg == BUILD_JOB_NAME: - print( + msg = ( f"Issue {target_url} created by {author} has {issue_type.value} " + f"the build job for {workflow} / {job_name}" ) + info(msg) return _filter_jobs( test_matrix=test_matrix, issue_type=issue_type, ) if target_job_cfg in (TEST_JOB_NAME, BUILD_AND_TEST_JOB_NAME): - print( + msg = ( f"Issue {target_url} created by {author} has {issue_type.value} " + f"all the test jobs for {workflow} / {job_name}" ) + info(msg) return _filter_jobs( test_matrix=test_matrix, issue_type=issue_type, @@ -463,7 +477,7 @@ def parse_reenabled_issues(s: Optional[str]) -> List[str]: def get_reenabled_issues(pr_body: str = "") -> List[str]: - default_branch = os.getenv("GIT_DEFAULT_BRANCH", "main") + default_branch = f"origin/{os.environ.get('GIT_DEFAULT_BRANCH', 'main')}" try: commit_messages = subprocess.check_output( f"git cherry -v {default_branch}".split(" ") @@ -474,6 +488,10 @@ def 
get_reenabled_issues(pr_body: str = "") -> List[str]: return parse_reenabled_issues(pr_body) + parse_reenabled_issues(commit_messages) +def check_for_setting(labels: Set[str], body: str, setting: str) -> bool: + return setting in labels or f"[{setting}]" in body + + def perform_misc_tasks( labels: Set[str], test_matrix: Dict[str, List[Any]], job_name: str, pr_body: str ) -> None: @@ -481,11 +499,24 @@ def perform_misc_tasks( In addition to apply the filter logic, the script also does the following misc tasks to set keep-going and is-unstable variables """ - set_output("keep-going", "keep-going" in labels) + set_output("keep-going", check_for_setting(labels, pr_body, "keep-going")) + set_output( + "ci-verbose-test-logs", + check_for_setting(labels, pr_body, "ci-verbose-test-logs"), + ) + set_output( + "ci-no-test-timeout", check_for_setting(labels, pr_body, "ci-no-test-timeout") + ) + set_output("ci-no-td", check_for_setting(labels, pr_body, "ci-no-td")) + # Only relevant for the one linux distributed cuda job, delete this when TD + # is rolled out completely + set_output( + "ci-td-distributed", check_for_setting(labels, pr_body, "ci-td-distributed") + ) # Obviously, if the job name includes unstable, then this is an unstable job is_unstable = job_name and IssueType.UNSTABLE.value in job_name - if not is_unstable and test_matrix: + if not is_unstable and test_matrix and test_matrix.get("include"): # Even when the job name doesn't mention unstable, we will also mark it as # unstable when the test matrix only includes unstable jobs. Basically, this # logic allows build or build-and-test jobs to be marked as unstable too. @@ -555,6 +586,16 @@ def main() -> None: # No PR number, no tag, we can just return the test matrix as it is filtered_test_matrix = test_matrix + if args.selected_test_configs: + selected_test_configs = { + v.strip().lower() + for v in args.selected_test_configs.split(",") + if v.strip() + } + filtered_test_matrix = filter_selected_test_configs( + filtered_test_matrix, selected_test_configs + ) + if args.event_name == "schedule" and args.schedule == "29 8 * * *": # we don't want to run the mem leak check or disabled tests on normal # periodically scheduled jobs, only the ones at this time @@ -577,7 +618,7 @@ def main() -> None: labels=labels, test_matrix=filtered_test_matrix, job_name=args.job_name, - pr_body=pr_body, + pr_body=pr_body if pr_body else "", ) # Set the filtered test matrix as the output diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index dcfa328c1875d..1e8bd57d44ac6 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -13,16 +13,16 @@ import os from typing import Dict, List, Optional, Tuple -CUDA_ARCHES = ["11.8", "12.1"] +CUDA_ARCHES = ["11.8", "12.1", "12.4"] -CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1"} +CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.0"} -CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8"} +CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8", "12.4": "8"} -ROCM_ARCHES = ["5.6", "5.7"] +ROCM_ARCHES = ["6.0", "6.1"] CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"] @@ -42,7 +42,7 @@ "nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - 
"nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'" ), "12.1": ( @@ -55,9 +55,23 @@ "nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'" ), + "12.4": ( + "nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'" + ), } @@ -324,7 +338,7 @@ def generate_wheels_matrix( ) # 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install - if arch_version in ["12.1", "11.8"] and os == "linux": + if arch_version in ["12.4", "12.1", "11.8"] and os == "linux": ret.append( { "python_version": python_version, @@ -367,5 +381,6 @@ def generate_wheels_matrix( return ret +validate_nccl_dep_consistency("12.4") validate_nccl_dep_consistency("12.1") validate_nccl_dep_consistency("11.8") diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 1075db4255ed0..5b2b473d2a597 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -274,42 +274,6 @@ class OperatingSystem: ] MACOS_BINARY_BUILD_WORKFLOWS = [ - BinaryBuildWorkflow( - os=OperatingSystem.MACOS, - package_type="wheel", - build_configs=generate_binary_build_matrix.generate_wheels_matrix( - OperatingSystem.MACOS - ), - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, - isolated_workflow=True, - ), - ), - BinaryBuildWorkflow( - os=OperatingSystem.MACOS, - package_type="conda", - build_configs=generate_binary_build_matrix.generate_conda_matrix( - OperatingSystem.MACOS - ), - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, - isolated_workflow=True, - ), - ), - BinaryBuildWorkflow( - os=OperatingSystem.MACOS, - package_type="libtorch", - 
abi_version=generate_binary_build_matrix.CXX11_ABI, - build_configs=generate_binary_build_matrix.generate_libtorch_matrix( - OperatingSystem.MACOS, - generate_binary_build_matrix.CXX11_ABI, - libtorch_variants=["shared-with-deps"], - ), - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, - isolated_workflow=True, - ), - ), BinaryBuildWorkflow( os=OperatingSystem.MACOS_ARM64, package_type="libtorch", @@ -342,7 +306,8 @@ class OperatingSystem: BinaryBuildWorkflow( os=OperatingSystem.MACOS_ARM64, package_type="conda", - cross_compile_arm64=True, + cross_compile_arm64=False, + macos_runner="macos-13-xlarge", build_configs=generate_binary_build_matrix.generate_conda_matrix( OperatingSystem.MACOS_ARM64 ), @@ -413,7 +378,9 @@ def main() -> None: for template, workflows in template_and_workflows: # added Iterable check to appease the mypy gods if not isinstance(workflows, Iterable): - raise Exception(f"How is workflows not iterable? {workflows}") + raise Exception( # noqa: TRY002 + f"How is workflows not iterable? {workflows}" + ) # noqa: TRY002 for workflow in workflows: workflow.generate_workflow_file(workflow_template=template) diff --git a/.github/scripts/generate_docker_release_matrix.py b/.github/scripts/generate_docker_release_matrix.py index 4dfa016be86cb..49d567ceadf8d 100644 --- a/.github/scripts/generate_docker_release_matrix.py +++ b/.github/scripts/generate_docker_release_matrix.py @@ -4,7 +4,7 @@ Will output a condensed version of the matrix. Will include fllowing: * CUDA version short - * CUDA full verison + * CUDA full version * CUDNN version short * Image type either runtime or devel * Platform linux/arm64,linux/amd64 @@ -21,6 +21,8 @@ def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]: ret: List[Dict[str, str]] = [] + # CUDA amd64 Docker images are available as both runtime and devel while + # CPU arm64 image is only available as runtime. for cuda, version in generate_binary_build_matrix.CUDA_ARCHES_FULL_VERSION.items(): for image in DOCKER_IMAGE_TYPES: ret.append( @@ -31,9 +33,19 @@ def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]: cuda ], "image_type": image, - "platform": "linux/arm64,linux/amd64", + "platform": "linux/amd64", } ) + ret.append( + { + "cuda": "cpu", + "cuda_full_version": "", + "cudnn_version": "", + "image_type": "runtime", + "platform": "linux/arm64", + } + ) + return {"include": ret} diff --git a/.github/scripts/get_aws_session_tokens.py b/.github/scripts/get_aws_session_tokens.py new file mode 100755 index 0000000000000..81a046f92778b --- /dev/null +++ b/.github/scripts/get_aws_session_tokens.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +import boto3 # type: ignore[import] + + +def main() -> None: + creds_dict = boto3.Session().get_credentials().get_frozen_credentials()._asdict() + print(f"export AWS_ACCESS_KEY_ID={creds_dict['access_key']}") + print(f"export AWS_SECRET_ACCESS_KEY={creds_dict['secret_key']}") + print(f"export AWS_SESSION_TOKEN={creds_dict['token']}") + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/get_workflow_job_id.py b/.github/scripts/get_workflow_job_id.py index 75bc7e016175b..28f337a30710a 100644 --- a/.github/scripts/get_workflow_job_id.py +++ b/.github/scripts/get_workflow_job_id.py @@ -4,6 +4,7 @@ import argparse import json +import operator import os import re import sys @@ -126,7 +127,7 @@ def find_job_id_name(args: Any) -> Tuple[str, str]: # Sort the jobs list by start time, in descending order. 
We want to get the most # recently scheduled job on the runner. - jobs.sort(key=lambda job: job["started_at"], reverse=True) + jobs.sort(key=operator.itemgetter("started_at"), reverse=True) for job in jobs: if job["runner_name"] == args.runner_name: diff --git a/.github/scripts/github_utils.py b/.github/scripts/github_utils.py index 05b95fc916646..d76d32f624d8a 100644 --- a/.github/scripts/github_utils.py +++ b/.github/scripts/github_utils.py @@ -119,6 +119,19 @@ def gh_fetch_json_dict( return cast(Dict[str, Any], _gh_fetch_json_any(url, params, data)) +def gh_graphql(query: str, **kwargs: Any) -> Dict[str, Any]: + rc = gh_fetch_url( + "https://api.github.com/graphql", + data={"query": query, "variables": kwargs}, + reader=json.load, + ) + if "errors" in rc: + raise RuntimeError( + f"GraphQL query {query}, args {kwargs} failed: {rc['errors']}" + ) + return cast(Dict[str, Any], rc) + + def _gh_post_comment( url: str, comment: str, dry_run: bool = False ) -> List[Dict[str, Any]]: diff --git a/.github/scripts/gitutils.py b/.github/scripts/gitutils.py index 280fa991f9466..1640e4354f90d 100644 --- a/.github/scripts/gitutils.py +++ b/.github/scripts/gitutils.py @@ -155,12 +155,19 @@ def branches_containing_ref( ) return [x.strip() for x in rc.split("\n") if x.strip()] if len(rc) > 0 else [] - def current_branch(self) -> str: - return self._run_git("symbolic-ref", "--short", "HEAD").strip() + def current_branch(self) -> Optional[str]: + try: + return self._run_git("symbolic-ref", "--short", "HEAD").strip() + except RuntimeError: + # we are in detached HEAD state + return None def checkout(self, branch: str) -> None: self._run_git("checkout", branch) + def create_branch_and_checkout(self, branch: str) -> None: + self._run_git("checkout", "-b", branch) + def fetch(self, ref: Optional[str] = None, branch: Optional[str] = None) -> None: if branch is None and ref is None: self._run_git("fetch", self.remote) @@ -273,6 +280,7 @@ def compute_branch_diffs( def cherry_pick_commits(self, from_branch: str, to_branch: str) -> None: orig_branch = self.current_branch() + assert orig_branch is not None, "Must be on a branch" self.checkout(to_branch) from_commits, to_commits = self.compute_branch_diffs(from_branch, to_branch) if len(from_commits) == 0: diff --git a/.github/scripts/gql_mocks.json.gz b/.github/scripts/gql_mocks.json.gz index d11489b339031..31a5230dbae9a 100644 Binary files a/.github/scripts/gql_mocks.json.gz and b/.github/scripts/gql_mocks.json.gz differ diff --git a/.github/scripts/label_utils.py b/.github/scripts/label_utils.py index 64128c065c663..eeb82ec316081 100644 --- a/.github/scripts/label_utils.py +++ b/.github/scripts/label_utils.py @@ -74,15 +74,23 @@ def gh_get_labels(org: str, repo: str) -> List[str]: def gh_add_labels( - org: str, repo: str, pr_num: int, labels: Union[str, List[str]] + org: str, repo: str, pr_num: int, labels: Union[str, List[str]], dry_run: bool ) -> None: + if dry_run: + print(f"Dryrun: Adding labels {labels} to PR {pr_num}") + return gh_fetch_url_and_headers( url=f"https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/labels", data={"labels": labels}, ) -def gh_remove_label(org: str, repo: str, pr_num: int, label: str) -> None: +def gh_remove_label( + org: str, repo: str, pr_num: int, label: str, dry_run: bool +) -> None: + if dry_run: + print(f"Dryrun: Removing {label} from PR {pr_num}") + return gh_fetch_url_and_headers( url=f"https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/labels/{label}", method="DELETE", diff --git 
a/.github/scripts/lintrunner.sh b/.github/scripts/lintrunner.sh new file mode 100755 index 0000000000000..82f472b0f16b7 --- /dev/null +++ b/.github/scripts/lintrunner.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +set -ex + +# The generic Linux job chooses to use base env, not the one setup by the image +CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") +eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)" +conda activate "${CONDA_ENV}" + +# Use uv to speed up lintrunner init +python3 -m pip install uv + +CACHE_DIRECTORY="/tmp/.lintbin" +# Try to recover the cached binaries +if [[ -d "${CACHE_DIRECTORY}" ]]; then + # It's ok to fail this as lintrunner init would download these binaries + # again if they do not exist + cp -r "${CACHE_DIRECTORY}" . || true +fi + +# This has already been cached in the docker image +lintrunner init 2> /dev/null + +# Do build steps necessary for linters +if [[ "${CLANG}" == "1" ]]; then + python3 -m tools.linter.clang_tidy.generate_build_files +fi +python3 -m tools.generate_torch_version --is_debug=false +python3 -m tools.pyi.gen_pyi \ + --native-functions-path aten/src/ATen/native/native_functions.yaml \ + --tags-path aten/src/ATen/native/tags.yaml \ + --deprecated-functions-path "tools/autograd/deprecated.yaml" + +RC=0 +# Run lintrunner on all files +if ! lintrunner --force-color --all-files --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then + echo "" + echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner -m origin/main\`. (If you don't get the same results, run \'lintrunner init\' to update your local linter)\e[0m" + echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m" + RC=1 +fi + +# Use jq to massage the JSON lint output into GitHub Actions workflow commands. +jq --raw-output \ + '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \ + lint.json || true + +exit $RC diff --git a/.github/scripts/s390x-ci/README.md b/.github/scripts/s390x-ci/README.md new file mode 100644 index 0000000000000..f62b02e24aa3e --- /dev/null +++ b/.github/scripts/s390x-ci/README.md @@ -0,0 +1,51 @@ +# Configuring the builder. + +## Install prerequisites. + +``` +$ sudo dnf install docker +``` + +## Add services. + +``` +$ sudo cp self-hosted-builder/*.service /etc/systemd/system/ +$ sudo systemctl daemon-reload +``` + +## Download qemu-user-static image + +``` +# sudo docker pull docker.io/iiilinuxibmcom/qemu-user-static:6.1.0-1 +``` + +## Autostart the x86_64 emulation support. + +``` +$ sudo systemctl enable --now qemu-user-static +``` + +## Rebuild the image + +In order to build or update the `iiilinuxibmcom/actions-runner` image, e.g. to get the +latest OS security fixes, use the following commands: + +``` +$ cd self-hosted-builder +$ sudo docker build \ + --build-arg repo=/ \ + --build-arg token=<***> \ + --pull \ + -f actions-runner.Dockerfile \ + -t iiilinuxibmcom/actions-runner \ + . +``` + +If it fails, ensure that selinux doesn't prevent it from working. +In worst case, selinux can be disabled with `setenforce 0`. + +## Autostart the runner. 
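For readers who do not speak jq: the filter in `lintrunner.sh` above turns each JSON record emitted by `lintrunner --tee-json` into a GitHub Actions workflow command. A rough Python equivalent is sketched below; the record is made up, and its field names are inferred from the jq program rather than from a documented schema.

```
import json

# One record of the lint.json stream (shape assumed for illustration only)
record = json.loads(
    '{"path": "torch/foo.py", "line": 10, "char": 4, "code": "FLAKE8",'
    ' "name": "E501", "severity": "advice", "description": "line too long\\nplease wrap"}'
)

severity = "warning" if record["severity"] in ("advice", "disabled") else record["severity"]
description = record["description"].replace("\n", "%0A")  # workflow commands are one line
print(
    f"::{severity} file={record['path']},line={record['line']},"
    f"col={record['char']},title={record['code']} {record['name']}::{description}"
)
# ::warning file=torch/foo.py,line=10,col=4,title=FLAKE8 E501::line too long%0Aplease wrap
```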
+ +``` +$ sudo systemctl enable --now actions-runner@$NAME +``` diff --git a/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile new file mode 100644 index 0000000000000..416a6d8e50df5 --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile @@ -0,0 +1,66 @@ +# Self-Hosted IBM Z Github Actions Runner. + +# Temporary image: amd64 dependencies. +FROM docker.io/amd64/ubuntu:22.04 as ld-prefix +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get -y install ca-certificates libicu70 libssl3 + +# Main image. +FROM docker.io/s390x/ubuntu:22.04 + +# Packages for pytorch building and testing. +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get -y install \ + cmake \ + curl \ + gcc \ + git \ + jq \ + libxml2-dev \ + libxslt-dev \ + ninja-build \ + python-is-python3 \ + python3 \ + python3-dev \ + python3-pip \ + pybind11-dev \ + python3-numpy \ + libopenblas-dev \ + liblapack-dev \ + libgloo-dev \ + python3-yaml \ + python3-scipy \ + virtualenv + +# amd64 dependencies. +COPY --from=ld-prefix / /usr/x86_64-linux-gnu/ +RUN ln -fs ../lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 /usr/x86_64-linux-gnu/lib64/ +RUN ln -fs /etc/resolv.conf /usr/x86_64-linux-gnu/etc/ +ENV QEMU_LD_PREFIX=/usr/x86_64-linux-gnu + +# Scripts. +COPY fs/ / + +RUN chmod +x /usr/bin/actions-runner /usr/bin/entrypoint + +# amd64 Github Actions Runner. +RUN useradd -m actions-runner +USER actions-runner +WORKDIR /home/actions-runner +RUN curl -L https://github.com/actions/runner/releases/download/v2.309.0/actions-runner-linux-x64-2.309.0.tar.gz | tar -xz + +# repository +ARG repo + +# repository token +ARG token + +RUN ./config.sh \ + --unattended \ + --url "https://github.com/${repo}" \ + --token "${token}" \ + --no-default-labels \ + --labels self-hosted,linux.s390x + +ENTRYPOINT ["/usr/bin/entrypoint"] +CMD ["/usr/bin/actions-runner"] diff --git a/.github/scripts/s390x-ci/self-hosted-builder/actions-runner@.service b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner@.service new file mode 100644 index 0000000000000..158be9ccb6c1d --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner@.service @@ -0,0 +1,22 @@ +[Unit] +Description=Self-Hosted IBM Z Github Actions Runner +Wants=qemu-user-static +After=qemu-user-static +StartLimitIntervalSec=0 + +[Service] +Type=simple +Restart=always +ExecStartPre=-/usr/bin/docker rm --force actions-runner.%i +ExecStart=/usr/bin/docker run \ + --init \ + --interactive \ + --name=actions-runner.%i \ + --rm \ + iiilinuxibmcom/actions-runner +ExecStop=/bin/sh -c "docker exec actions-runner.%i kill -INT -- -1" +ExecStop=/bin/sh -c "docker wait actions-runner.%i" +ExecStop=/bin/sh -c "docker rm actions-runner.%i" + +[Install] +WantedBy=multi-user.target diff --git a/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner new file mode 100644 index 0000000000000..760784b21c396 --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +set -e -u + +# Run one job. 
+./run.sh --once diff --git a/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/entrypoint b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/entrypoint new file mode 100644 index 0000000000000..14f6c84ca602e --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/entrypoint @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +# +# Container entrypoint that waits for all spawned processes. +# + +set -e -u + +# Create a FIFO and start reading from its read end. +tempdir=$(mktemp -d "/tmp/done.XXXXXXXXXX") +trap 'rm -r "$tempdir"' EXIT +done="$tempdir/pipe" +mkfifo "$done" +cat "$done" & waiter=$! + +# Start the workload. Its descendants will inherit the FIFO's write end. +status=0 +if [ "$#" -eq 0 ]; then + bash 9>"$done" || status=$? +else + "$@" 9>"$done" || status=$? +fi + +# When the workload and all of its descendants exit, the FIFO's write end will +# be closed and `cat "$done"` will exit. Wait until it happens. This is needed +# in order to handle SelfUpdater, which the workload may start in background +# before exiting. +wait "$waiter" + +exit "$status" diff --git a/.github/scripts/s390x-ci/self-hosted-builder/qemu-user-static.service b/.github/scripts/s390x-ci/self-hosted-builder/qemu-user-static.service new file mode 100644 index 0000000000000..40b6c5b17f3ea --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/qemu-user-static.service @@ -0,0 +1,11 @@ +[Unit] +Description=Support for transparent execution of non-native binaries with QEMU user emulation + +[Service] +Type=oneshot +# The source code for iiilinuxibmcom/qemu-user-static is at https://github.com/iii-i/qemu-user-static/tree/v6.1.0-1 +# TODO: replace it with multiarch/qemu-user-static once version >6.1 is available +ExecStart=/usr/bin/docker run --rm --interactive --privileged docker.io/iiilinuxibmcom/qemu-user-static:6.1.0-1 --reset -p yes + +[Install] +WantedBy=multi-user.target diff --git a/.github/scripts/td_llm_indexer.sh b/.github/scripts/td_llm_indexer.sh new file mode 100644 index 0000000000000..97565b5db0386 --- /dev/null +++ b/.github/scripts/td_llm_indexer.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -euxo pipefail + +# Download requirements +cd llm-target-determinator +pip install -q -r requirements.txt +cd ../codellama +pip install -e . 
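The FIFO trick in the s390x container entrypoint above is subtle enough to deserve a tiny demonstration. The sketch below is hypothetical and not part of the CI; it shows the same mechanism in Python, where the two-second `sleep` stands in for a background process such as the runner's self-updater: the parent reads a FIFO whose write end is inherited by every descendant, so EOF — and therefore the final wait — only happens once all of them have exited.

```
import os
import subprocess
import tempfile

tmpdir = tempfile.mkdtemp()
fifo = os.path.join(tmpdir, "done")
os.mkfifo(fifo)

reader = subprocess.Popen(["cat", fifo])  # exits only once every write end is closed
wfd = os.open(fifo, os.O_WRONLY)          # write end handed down to the workload

# The direct child exits immediately, but its backgrounded grandchild keeps the
# inherited write end open for two more seconds.
subprocess.run(["bash", "-c", "(sleep 2) & exit 0"], pass_fds=(wfd,))

os.close(wfd)   # drop the parent's own copy of the write end
reader.wait()   # returns only after the grandchild has exited
print("all descendants have exited")

os.remove(fifo)
os.rmdir(tmpdir)
```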
+ +# Run indexer +cd ../llm-target-determinator + +torchrun \ + --standalone \ + --nnodes=1 \ + --nproc-per-node=1 \ + indexer.py \ + --experiment-name indexer-files \ + --granularity FILE diff --git a/.github/scripts/test_fetch_latest_green_commit.py b/.github/scripts/test_fetch_latest_green_commit.py deleted file mode 100644 index 0888d0556d12e..0000000000000 --- a/.github/scripts/test_fetch_latest_green_commit.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import Any, Dict, List -from unittest import main, mock, TestCase - -from fetch_latest_green_commit import isGreen, WorkflowCheck - -workflowNames = [ - "pull", - "trunk", - "Lint", - "linux-binary-libtorch-pre-cxx11", - "android-tests", - "windows-binary-wheel", - "periodic", - "docker-release-builds", - "nightly", - "pr-labels", - "Close stale pull requests", - "Update S3 HTML indices for download.pytorch.org", - "Create Release", -] - - -def set_workflow_job_status( - workflow: List[Dict[str, Any]], name: str, status: str -) -> List[Dict[str, Any]]: - for check in workflow: - if check["workflowName"] == name: - check["conclusion"] = status - return workflow - - -class TestChecks: - def make_test_checks(self) -> List[Dict[str, Any]]: - workflow_checks = [] - for i in range(len(workflowNames)): - workflow_checks.append( - WorkflowCheck( - workflowName=workflowNames[i], - name="test/job", - jobName="job", - conclusion="success", - )._asdict() - ) - return workflow_checks - - -class TestPrintCommits(TestCase): - @mock.patch( - "fetch_latest_green_commit.get_commit_results", - return_value=TestChecks().make_test_checks(), - ) - def test_all_successful(self, mock_get_commit_results: Any) -> None: - "Test with workflows are successful" - workflow_checks = mock_get_commit_results() - self.assertTrue(isGreen("sha", workflow_checks)[0]) - - @mock.patch( - "fetch_latest_green_commit.get_commit_results", - return_value=TestChecks().make_test_checks(), - ) - def test_necessary_successful(self, mock_get_commit_results: Any) -> None: - "Test with necessary workflows are successful" - workflow_checks = mock_get_commit_results() - workflow_checks = set_workflow_job_status( - workflow_checks, workflowNames[8], "failed" - ) - workflow_checks = set_workflow_job_status( - workflow_checks, workflowNames[9], "failed" - ) - workflow_checks = set_workflow_job_status( - workflow_checks, workflowNames[10], "failed" - ) - workflow_checks = set_workflow_job_status( - workflow_checks, workflowNames[11], "failed" - ) - workflow_checks = set_workflow_job_status( - workflow_checks, workflowNames[12], "failed" - ) - self.assertTrue(isGreen("sha", workflow_checks)[0]) - - @mock.patch( - "fetch_latest_green_commit.get_commit_results", - return_value=TestChecks().make_test_checks(), - ) - def test_necessary_skipped(self, mock_get_commit_results: Any) -> None: - "Test with necessary job (ex: pull) skipped" - workflow_checks = mock_get_commit_results() - workflow_checks = set_workflow_job_status(workflow_checks, "pull", "skipped") - result = isGreen("sha", workflow_checks) - self.assertTrue(result[0]) - - @mock.patch( - "fetch_latest_green_commit.get_commit_results", - return_value=TestChecks().make_test_checks(), - ) - def test_skippable_skipped(self, mock_get_commit_results: Any) -> None: - "Test with skippable jobs (periodic and docker-release-builds skipped" - workflow_checks = mock_get_commit_results() - workflow_checks = set_workflow_job_status( - workflow_checks, "periodic", "skipped" - ) - workflow_checks = set_workflow_job_status( - workflow_checks, 
"docker-release-builds", "skipped" - ) - self.assertTrue(isGreen("sha", workflow_checks)) - - @mock.patch( - "fetch_latest_green_commit.get_commit_results", - return_value=TestChecks().make_test_checks(), - ) - def test_necessary_failed(self, mock_get_commit_results: Any) -> None: - "Test with necessary job (ex: Lint) failed" - workflow_checks = mock_get_commit_results() - workflow_checks = set_workflow_job_status(workflow_checks, "Lint", "failed") - result = isGreen("sha", workflow_checks) - self.assertFalse(result[0]) - self.assertEqual(result[1], "Lint checks were not successful") - - @mock.patch( - "fetch_latest_green_commit.get_commit_results", - return_value=TestChecks().make_test_checks(), - ) - def test_skippable_failed(self, mock_get_commit_results: Any) -> None: - "Test with failing skippable jobs (ex: docker-release-builds) should pass" - workflow_checks = mock_get_commit_results() - workflow_checks = set_workflow_job_status( - workflow_checks, "periodic", "skipped" - ) - workflow_checks = set_workflow_job_status( - workflow_checks, "docker-release-builds", "failed" - ) - result = isGreen("sha", workflow_checks) - self.assertTrue(result[0]) - - @mock.patch("fetch_latest_green_commit.get_commit_results", return_value={}) - def test_no_workflows(self, mock_get_commit_results: Any) -> None: - "Test with missing workflows" - workflow_checks = mock_get_commit_results() - result = isGreen("sha", workflow_checks) - self.assertFalse(result[0]) - self.assertEqual( - result[1], - "missing required workflows: pull, trunk, lint, linux-binary", - ) - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/test_filter_test_configs.py b/.github/scripts/test_filter_test_configs.py index 8722ff7a44e9f..2f73d022c3dab 100755 --- a/.github/scripts/test_filter_test_configs.py +++ b/.github/scripts/test_filter_test_configs.py @@ -9,6 +9,7 @@ import yaml from filter_test_configs import ( filter, + filter_selected_test_configs, get_labels, mark_unstable_jobs, parse_reenabled_issues, @@ -17,7 +18,6 @@ remove_disabled_jobs, set_periodic_modes, SUPPORTED_PERIODICAL_MODES, - VALID_TEST_CONFIG_LABELS, ) @@ -273,13 +273,13 @@ def test_filter(self) -> None: testcases = [ { "test_matrix": '{include: [{config: "default", runner: "linux"}]}', - "expected": '{"include": [{"config": "default", "runner": "linux"}]}', - "description": "No match, keep the same test matrix", + "expected": '{"include": []}', + "description": "Request test-config/cfg but the test matrix doesn't have it", }, { "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "plain-cfg"}]}', - "expected": '{"include": [{"config": "default", "runner": "linux"}, {"config": "plain-cfg"}]}', - "description": "No match because there is no prefix or suffix, keep the same test matrix", + "expected": '{"include": []}', + "description": "A valid test config label needs to start with test-config/", }, { "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", shard: 1}]}', @@ -294,9 +294,8 @@ def test_filter(self) -> None: ) self.assertEqual(case["expected"], json.dumps(filtered_test_matrix)) - def test_filter_with_valid_label(self) -> None: + def test_filter_with_test_config_label(self) -> None: mocked_labels = {f"{PREFIX}cfg", "ciflow/trunk"} - VALID_TEST_CONFIG_LABELS.add(f"{PREFIX}cfg") testcases = [ { @@ -317,6 +316,51 @@ def test_filter_with_valid_label(self) -> None: ) self.assertEqual(case["expected"], json.dumps(filtered_test_matrix)) + def test_filter_selected_test_configs(self) -> None: + 
testcases = [ + { + "test_matrix": '{include: [{config: "default"}]}', + "selected_test_configs": "", + "expected": '{"include": [{"config": "default"}]}', + "description": "No selected test configs", + }, + { + "test_matrix": '{include: [{config: "default"}]}', + "selected_test_configs": "foo", + "expected": '{"include": []}', + "description": "A different test config is selected", + }, + { + "test_matrix": '{include: [{config: "default"}]}', + "selected_test_configs": "foo, bar", + "expected": '{"include": []}', + "description": "A different set of test configs is selected", + }, + { + "test_matrix": '{include: [{config: "default"}]}', + "selected_test_configs": "foo, bar,default", + "expected": '{"include": [{"config": "default"}]}', + "description": "One of the test config is selected", + }, + { + "test_matrix": '{include: [{config: "default"}, {config: "bar"}]}', + "selected_test_configs": "foo, bar,Default", + "expected": '{"include": [{"config": "default"}, {"config": "bar"}]}', + "description": "Several test configs are selected", + }, + ] + + for case in testcases: + selected_test_configs = { + v.strip().lower() + for v in case["selected_test_configs"].split(",") + if v.strip() + } + filtered_test_matrix = filter_selected_test_configs( + yaml.safe_load(case["test_matrix"]), selected_test_configs + ) + self.assertEqual(case["expected"], json.dumps(filtered_test_matrix)) + def test_set_periodic_modes(self) -> None: testcases: List[Dict[str, str]] = [ { @@ -636,55 +680,110 @@ def test_mark_unstable_jobs(self, mock_download_json: Any) -> None: @mock.patch("subprocess.check_output") def test_perform_misc_tasks(self, mocked_subprocess: Any) -> None: + def _gen_expected_string( + keep_going: bool = False, + ci_verbose_test_logs: bool = False, + ci_no_test_timeout: bool = False, + ci_no_td: bool = False, + ci_td_distributed: bool = False, + is_unstable: bool = False, + reenabled_issues: str = "", + ) -> str: + return ( + f"keep-going={keep_going}\n" + f"ci-verbose-test-logs={ci_verbose_test_logs}\n" + f"ci-no-test-timeout={ci_no_test_timeout}\n" + f"ci-no-td={ci_no_td}\n" + f"ci-td-distributed={ci_td_distributed}\n" + f"is-unstable={is_unstable}\n" + f"reenabled-issues={reenabled_issues}\n" + ) + mocked_subprocess.return_value = b"" testcases: List[Dict[str, Any]] = [ { "labels": {}, "test_matrix": '{include: [{config: "default"}]}', "job_name": "A job name", - "expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n", + "expected": _gen_expected_string(), "description": "No keep-going, no is-unstable", }, { "labels": {"keep-going"}, "test_matrix": '{include: [{config: "default"}]}', "job_name": "A job name", - "expected": "keep-going=True\nis-unstable=False\nreenabled-issues=\n", + "expected": _gen_expected_string(keep_going=True), "description": "Has keep-going, no is-unstable", }, + { + "labels": {}, + "test_matrix": '{include: [{config: "default"}]}', + "job_name": "A job name", + "pr_body": "[keep-going]", + "expected": _gen_expected_string(keep_going=True), + "description": "Keep-going in PR body", + }, + { + "labels": {"ci-verbose-test-logs"}, + "test_matrix": '{include: [{config: "default"}]}', + "job_name": "A job name", + "pr_body": "[ci-no-test-timeout]", + "expected": _gen_expected_string( + ci_verbose_test_logs=True, ci_no_test_timeout=True + ), + "description": "No pipe logs label and no test timeout in PR body", + }, + { + "labels": {"ci-no-test-timeout"}, + "test_matrix": '{include: [{config: "default"}]}', + "job_name": "A job name", + "pr_body": 
"[ci-verbose-test-logs]", + "expected": _gen_expected_string( + ci_verbose_test_logs=True, ci_no_test_timeout=True + ), + "description": "No pipe logs in PR body and no test timeout in label (same as the above but swapped)", + }, + { + "labels": {"ci-no-td"}, + "test_matrix": '{include: [{config: "default"}]}', + "job_name": "A job name", + "pr_body": "", + "expected": _gen_expected_string(ci_no_td=True), + "description": "No pipe logs in PR body and no test timeout in label (same as the above but swapped)", + }, { "labels": {}, "test_matrix": '{include: [{config: "default"}]}', "job_name": None, - "expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n", + "expected": _gen_expected_string(), "description": "No job name", }, { "labels": {}, "test_matrix": '{include: [{config: "default"}]}', - "job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-12, unstable)", - "expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n", + "job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-stable, unstable)", + "expected": _gen_expected_string(is_unstable=True), "description": "Unstable job", }, { "labels": {}, "test_matrix": '{include: [{config: "default"}]}', - "job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-12, unstable)", - "expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n", + "job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-stable, unstable)", + "expected": _gen_expected_string(is_unstable=True), "description": "Unstable job", }, { "labels": {}, "test_matrix": '{include: [{config: "1", unstable: "unstable"}, {config: "2", unstable: "unstable"}]}', "job_name": "macos-12-py3-arm64 / build", - "expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n", + "expected": _gen_expected_string(is_unstable=True), "description": "All configs are unstable", }, { "labels": {}, "test_matrix": '{include: [{config: "1", unstable: "unstable"}, {config: "2"}]}', "job_name": "macos-12-py3-arm64 / build", - "expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n", + "expected": _gen_expected_string(is_unstable=False), "description": "Only mark some configs as unstable", }, { @@ -692,7 +791,7 @@ def test_perform_misc_tasks(self, mocked_subprocess: Any) -> None: "test_matrix": '{include: [{config: "default"}]}', "job_name": "A job name", "pr_body": "resolves #123 fixes #234", - "expected": "keep-going=False\nis-unstable=False\nreenabled-issues=123,234\n", + "expected": _gen_expected_string(reenabled_issues="123,234"), "description": "Reenable some issues", }, ] diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py index fbcbe048df14f..2641fd30f348e 100755 --- a/.github/scripts/test_trymerge.py +++ b/.github/scripts/test_trymerge.py @@ -16,6 +16,8 @@ from unittest import main, mock, skip, TestCase from urllib.error import HTTPError +from github_utils import gh_graphql + from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo from trymerge import ( @@ -26,7 +28,6 @@ get_drci_classifications, get_rockset_results, gh_get_team_members, - gh_graphql, GitHubPR, JobCheckState, main as trymerge_main, @@ -140,11 +141,14 @@ def __init__(self) -> None: self.comment_id = 0 self.reason = "this is for testing" self.ignore_current = False + self.check_mergeability = False return Object() -def mock_remove_label(org: str, repo: str, pr_num: str, label: str) -> None: +def mock_remove_label( + org: str, repo: str, pr_num: str, label: str, dry_run: bool +) -> None: pass @@ -201,7 
+205,6 @@ def mocked_read_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule approved_by=["pytorch/metamates", "ngimel"], mandatory_checks_name=[ "Lint", - "Facebook CLA Check", "pull / linux-xenial-cuda11.3-py3.7-gcc7 / build", ], ignore_flaky_failures=True, @@ -394,7 +397,7 @@ def test_gql_complexity(self, *args: Any) -> None: def test_gql_retrieve_checksuites(self, *args: Any) -> None: "Fetch comments and conclusions for PR with 60 commits" pr = GitHubPR("pytorch", "pytorch", 94787) - self.assertEqual(len(pr.get_checkrun_conclusions()), 183) + self.assertEqual(len(pr.get_checkrun_conclusions()), 182) def test_team_members(self, *args: Any) -> None: "Test fetching team members works" @@ -431,6 +434,13 @@ def test_get_author_many_reviews(self, *args: Any) -> None: assert pr._reviews is not None # to pacify mypy self.assertGreater(len(pr._reviews), 100) + def get_co_authors(self, *args: Any) -> None: + """Tests that co-authors are recognized""" + pr = GitHubPR("pytorch", "pytorch", 118347) + authors = pr.get_authors() + self.assertIn("kit1980", authors) + self.assertIn("Co-authored-by:", pr.gen_commit_message()) + def test_get_checkruns_many_runs(self, *args: Any) -> None: """Tests that all checkruns can be fetched""" pr = GitHubPR("pytorch", "pytorch", 105260) @@ -731,6 +741,30 @@ def test_get_classifications_unstable(self, *args: Any) -> None: self.assertTrue(len(failed) == 0) self.assertTrue(len(ignorable["UNSTABLE"]) == 1) + # Add another test case where there is no unstable keyword in the job name, but + # the job has already been marked as unstable + pr = GitHubPR("pytorch", "executorch", 3318) + checks = pr.get_checkrun_conclusions() + checks = get_classifications( + pr.pr_num, + pr.project, + checks, + [], + ) + print(checks) + workflow_name = "test-llama-app" + job_name = "mobile-job (android)" + self.assertTrue( + checks[f"Android / {workflow_name} / {job_name}"].classification + == "UNSTABLE" + ) + pending, failed, ignorable = categorize_checks( + checks, list(checks.keys()), ok_failed_checks_threshold=1 + ) + self.assertTrue(len(pending) == 0) + self.assertTrue(len(failed) == 0) + self.assertTrue(len(ignorable["UNSTABLE"]) == 1) + def test_get_classifications_broken_trunk(self, *args: Any) -> None: # The mock merge base is the actual value returned by gh_fetch_merge_base test_cases = [ @@ -822,6 +856,41 @@ def test_ignore_current(self, *args: Any) -> None: self.assertTrue(len(ignorable["FLAKY"]) == 4) self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 2) + def test_get_classifications_wrong_workflow_name(self, *args: Any) -> None: + pr = GitHubPR("pytorch", "pytorch", 123104) + checks = pr.get_checkrun_conclusions() + + check_name = "linux-binary-conda / conda-py3_8-cuda11_8-build / build" + check_name_workflow_path = ".github/workflows/generated-linux-binary-conda-nightly.yml / conda-py3_8-cuda11_8-build / build" + + # Mock a check where the workflow name uses the full path + checks[check_name_workflow_path] = JobCheckState( + check_name_workflow_path, + checks[check_name].url, + checks[check_name].status, + checks[check_name].classification, + checks[check_name].job_id, + checks[check_name].title, + checks[check_name].summary, + ) + del checks[check_name] + + checks = get_classifications( + pr.pr_num, + pr.project, + checks, + [], + ) + pending, failed, ignorable = categorize_checks( + checks, + list(checks.keys()), + ) + + self.assertTrue(len(pending) == 0) + self.assertTrue(len(failed) == 0) + self.assertTrue(len(ignorable["FLAKY"]) == 1) + 
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0) + @mock.patch("trymerge.read_merge_rules", side_effect=xla_merge_rules) def test_dont_ignore_flaky_failures(self, *args: Any) -> None: """ diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index c2b513e47f41c..95311d2d9b836 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -39,6 +39,7 @@ gh_fetch_json_list, gh_fetch_merge_base, gh_fetch_url, + gh_graphql, gh_post_commit_comment, gh_post_pr_comment, gh_update_pr_state, @@ -122,6 +123,7 @@ def __init__(self, name: str, url: str, status: Optional[str]): workflow { name } + databaseId url } checkRuns(first: 50) { @@ -152,12 +154,14 @@ def __init__(self, name: str, url: str, status: Optional[str]): fragment CommitAuthors on PullRequestCommitConnection { nodes { commit { - author { - user { - login + authors(first: 2) { + nodes { + user { + login + } + email + name } - email - name } oid } @@ -458,19 +462,6 @@ def __init__(self, name: str, url: str, status: Optional[str]): IGNORABLE_FAILED_CHECKS_THESHOLD = 10 -def gh_graphql(query: str, **kwargs: Any) -> Dict[str, Any]: - rc = gh_fetch_url( - "https://api.github.com/graphql", - data={"query": query, "variables": kwargs}, - reader=json.load, - ) - if "errors" in rc: - raise RuntimeError( - f"GraphQL query {query}, args {kwargs} failed: {rc['errors']}" - ) - return cast(Dict[str, Any], rc) - - def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any: rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no) return rc["data"]["repository"]["pullRequest"] @@ -608,6 +599,7 @@ def parse_args() -> Any: parser.add_argument("--revert", action="store_true") parser.add_argument("--force", action="store_true") parser.add_argument("--ignore-current", action="store_true") + parser.add_argument("--check-mergeability", action="store_true") parser.add_argument("--comment-id", type=int) parser.add_argument("--reason", type=str) parser.add_argument("pr_num", type=int) @@ -745,7 +737,7 @@ def get_merge_base(self) -> str: # work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base, # so let's just use main instead self.merge_base = gh_fetch_merge_base( - self.org, self.project, last_commit_oid, "main" + self.org, self.project, last_commit_oid, self.default_branch() ) # Fallback to baseRefOid if the API call fails, i.e. rate limit. 
Note that baseRefOid @@ -845,14 +837,14 @@ def _fetch_authors(self) -> List[Tuple[str, str]]: def add_authors(info: Dict[str, Any]) -> None: for node in info["commits_with_authors"]["nodes"]: - author_node = node["commit"]["author"] - user_node = author_node["user"] - author = f"{author_node['name']} <{author_node['email']}>" - if user_node is None: - # If author is not github user, user node will be null - authors.append(("", author)) - else: - authors.append((cast(str, user_node["login"]), author)) + for author_node in node["commit"]["authors"]["nodes"]: + user_node = author_node["user"] + author = f"{author_node['name']} <{author_node['email']}>" + if user_node is None: + # If author is not github user, user node will be null + authors.append(("", author)) + else: + authors.append((cast(str, user_node["login"]), author)) info = self.info for _ in range(100): @@ -948,11 +940,6 @@ def get_pr_next_checksuites(checksuites: Any) -> Any: def get_authors(self) -> Dict[str, str]: rc = {} - # TODO: replace with `self.get_commit_count()` when GraphQL pagination can be used - # to fetch all commits, see https://gist.github.com/malfet/4f35321b0c9315bcd7116c7b54d83372 - # and https://support.github.com/ticket/enterprise/1642/1659119 - if self.get_commit_count() <= 250: - assert len(self._fetch_authors()) == self.get_commit_count() for idx in range(len(self._fetch_authors())): rc[self.get_committer_login(idx)] = self.get_committer_author(idx) @@ -1068,6 +1055,7 @@ def merge_ghstack_into( repo: GitRepo, skip_mandatory_checks: bool, comment_id: Optional[int] = None, + skip_all_rule_checks: bool = False, ) -> List["GitHubPR"]: assert self.is_ghstack_pr() ghstack_prs = get_ghstack_prs( @@ -1082,7 +1070,7 @@ def merge_ghstack_into( commit_msg = pr.gen_commit_message( filter_ghstack=True, ghstack_deps=pr_dependencies ) - if pr.pr_num != self.pr_num: + if pr.pr_num != self.pr_num and not skip_all_rule_checks: # Raises exception if matching rule is not found find_matching_merge_rule( pr, @@ -1113,13 +1101,19 @@ def gen_commit_message( msg_body = re.sub(RE_GHSTACK_DESC, "", msg_body) msg = self.get_title() + f" (#{self.pr_num})\n\n" msg += msg_body + + # Mention PR co-authors + for author_login, author_name in self.get_authors().items(): + if author_login != self.get_pr_creator_login(): + msg += f"\nCo-authored-by: {author_name}" + msg += f"\nPull Request resolved: {self.get_pr_url()}\n" msg += f"Approved by: {approved_by_urls}\n" if ghstack_deps: msg += f"ghstack dependencies: {', '.join([f'#{pr.pr_num}' for pr in ghstack_deps])}\n" return msg - def add_numbered_label(self, label_base: str) -> None: + def add_numbered_label(self, label_base: str, dry_run: bool) -> None: labels = self.get_labels() if self.labels is not None else [] full_label = label_base count = 0 @@ -1127,7 +1121,7 @@ def add_numbered_label(self, label_base: str) -> None: if label_base in label: count += 1 full_label = f"{label_base}X{count}" - gh_add_labels(self.org, self.project, self.pr_num, [full_label]) + gh_add_labels(self.org, self.project, self.pr_num, [full_label], dry_run) def merge_into( self, @@ -1157,9 +1151,9 @@ def merge_into( repo.push(self.default_branch(), dry_run) if not dry_run: - self.add_numbered_label(MERGE_COMPLETE_LABEL) + self.add_numbered_label(MERGE_COMPLETE_LABEL, dry_run) for pr in additional_merged_prs: - pr.add_numbered_label(MERGE_COMPLETE_LABEL) + pr.add_numbered_label(MERGE_COMPLETE_LABEL, dry_run) if comment_id and self.pr_num: # When the merge process reaches this part, we can assume that the commit @@ 
-1199,7 +1193,11 @@ def merge_changes( skip_mandatory_checks: bool = False, comment_id: Optional[int] = None, branch: Optional[str] = None, + skip_all_rule_checks: bool = False, ) -> List["GitHubPR"]: + """ + :param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally + """ branch_to_merge_into = self.default_branch() if branch is None else branch if repo.current_branch() != branch_to_merge_into: repo.checkout(branch_to_merge_into) @@ -1215,6 +1213,7 @@ def merge_changes( repo, skip_mandatory_checks, comment_id=comment_id, + skip_all_rule_checks=skip_all_rule_checks, ) @@ -1400,7 +1399,10 @@ def find_matching_merge_rule( ) required_checks = list( filter( - lambda x: "EasyCLA" in x or not skip_mandatory_checks, mandatory_checks + lambda x: ("EasyCLA" in x) + or ("Facebook CLA Check" in x) + or not skip_mandatory_checks, + mandatory_checks, ) ) pending_checks, failed_checks, _ = categorize_checks( @@ -1411,6 +1413,13 @@ def find_matching_merge_rule( else 0, ) + # categorize_checks assumes all tests are required if required_checks is empty. + # this is a workaround as we want to keep that behavior for categorize_checks + # generally. + if not required_checks: + pending_checks = [] + failed_checks = [] + hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}" if len(failed_checks) > 0: if reject_reason_score < 30000: @@ -1610,28 +1619,59 @@ def remove_job_name_suffix(name: str, replacement: str = ")") -> str: def is_broken_trunk( - name: str, + check: JobCheckState, drci_classifications: Any, ) -> bool: - if not name or not drci_classifications: + if not check or not drci_classifications: return False + name = check.name + job_id = check.job_id + # Consult the list of broken trunk failures from Dr.CI return any( - name == broken_trunk["name"] + (name == broken_trunk["name"]) or (job_id and job_id == broken_trunk["id"]) for broken_trunk in drci_classifications.get("BROKEN_TRUNK", []) ) +def is_unstable( + check: JobCheckState, + drci_classifications: Any, +) -> bool: + if not check or not drci_classifications: + return False + + name = check.name + job_id = check.job_id + + # The job name has the unstable keyword. This is the original way to mark a job + # as unstable on HUD, Dr.CI, and trymerge + if "unstable" in name: + return True + + # Consult the list of unstable failures from Dr.CI + return any( + (name == unstable["name"] or (job_id and job_id == unstable["id"])) + for unstable in drci_classifications.get("UNSTABLE", []) + ) + + def is_flaky( - name: str, + check: JobCheckState, drci_classifications: Any, ) -> bool: - if not name or not drci_classifications: + if not check or not drci_classifications: return False + name = check.name + job_id = check.job_id + # Consult the list of flaky failures from Dr.CI - return any(name == flaky["name"] for flaky in drci_classifications.get("FLAKY", [])) + return any( + (name == flaky["name"] or (job_id and job_id == flaky["id"])) + for flaky in drci_classifications.get("FLAKY", []) + ) def is_invalid_cancel( @@ -1669,7 +1709,19 @@ def get_classifications( # going forward. 
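The `is_broken_trunk`, `is_unstable`, and `is_flaky` helpers above now take the whole `JobCheckState` and match a check against Dr.CI's classification buckets by either display name or numeric job id, so renamed jobs are still recognized. A minimal standalone sketch of that matching rule, assuming a simplified stand-in for `JobCheckState` (only the two fields the rule needs; the sample data is illustrative):

```python
from typing import List, NamedTuple, Optional


class FakeCheck(NamedTuple):
    # Simplified stand-in for trymerge's JobCheckState.
    name: str
    job_id: Optional[int]


def matches_drci_bucket(check: FakeCheck, bucket: List[dict]) -> bool:
    # A check belongs to a Dr.CI bucket (FLAKY / UNSTABLE / BROKEN_TRUNK)
    # when either its name or its job id appears in that bucket.
    return any(
        check.name == entry["name"] or (check.job_id and check.job_id == entry["id"])
        for entry in bucket
    )


flaky = [{"name": "pull / linux-jammy-py3.8 / test (default, 1, 3)", "id": 123456}]
print(matches_drci_bucket(FakeCheck("renamed job", 123456), flaky))   # True, via job id
print(matches_drci_bucket(FakeCheck("some other job", None), flaky))  # False
```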
It's preferable to try calling Dr.CI API directly first # to get the latest results as well as update Dr.CI PR comment drci_classifications = get_drci_classifications(pr_num=pr_num, project=project) - print(f"From Dr.CI API: {json.dumps(drci_classifications)}") + + def get_readable_drci_results(drci_classifications: Any) -> str: + try: + s = f"From Dr.CI API ({pr_num}):\n" + for classification, jobs in drci_classifications.items(): + s += f" {classification}: \n" + for job in jobs: + s += f" {job['id']} {job['name']}\n" + return s + except Exception: + return f"From Dr.CI API: {json.dumps(drci_classifications)}" + + print(get_readable_drci_results(drci_classifications)) # NB: if the latest results from Dr.CI is not available, i.e. when calling from # SandCastle, we fallback to any results we can find on Dr.CI check run summary @@ -1692,7 +1744,7 @@ def get_classifications( if check.status == "SUCCESS" or check.status == "NEUTRAL": continue - if "unstable" in name: + if is_unstable(check, drci_classifications): checks_with_classifications[name] = JobCheckState( check.name, check.url, @@ -1706,7 +1758,7 @@ def get_classifications( # NB: It's important to note that when it comes to ghstack and broken trunk classification, # Dr.CI uses the base of the whole stack - if is_broken_trunk(name, drci_classifications): + if is_broken_trunk(check, drci_classifications): checks_with_classifications[name] = JobCheckState( check.name, check.url, @@ -1718,7 +1770,7 @@ def get_classifications( ) continue - elif is_flaky(name, drci_classifications): + elif is_flaky(check, drci_classifications): checks_with_classifications[name] = JobCheckState( check.name, check.url, @@ -1882,8 +1934,8 @@ def do_revert_prs( pr.org, pr.project, pr.pr_num, revert_message, dry_run=dry_run ) + pr.add_numbered_label("reverted", dry_run) if not dry_run: - pr.add_numbered_label("reverted") gh_post_commit_comment(pr.org, pr.project, commit_sha, revert_msg) gh_update_pr_state(pr.org, pr.project, pr.pr_num) @@ -2053,7 +2105,7 @@ def merge( print(f"Attempting merge of {initial_commit_sha} ({pr_link})") if MERGE_IN_PROGRESS_LABEL not in pr.get_labels(): - gh_add_labels(pr.org, pr.project, pr.pr_num, [MERGE_IN_PROGRESS_LABEL]) + gh_add_labels(pr.org, pr.project, pr.pr_num, [MERGE_IN_PROGRESS_LABEL], dry_run) explainer = TryMergeExplainer( skip_mandatory_checks, @@ -2073,8 +2125,7 @@ def merge( check_for_sev(pr.org, pr.project, skip_mandatory_checks) - if skip_mandatory_checks or can_skip_internal_checks(pr, comment_id): - # do not wait for any pending signals if PR is closed as part of co-development process + if skip_mandatory_checks: gh_post_pr_comment( pr.org, pr.project, @@ -2201,8 +2252,7 @@ def merge( # Finally report timeout back msg = f"Merged timed out after {timeout_minutes} minutes. Please contact the pytorch_dev_infra team." 
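The diff also pushes the `dry_run` decision down into the labeling helpers: `add_numbered_label`, `gh_add_labels`, and `gh_remove_label` now receive `dry_run` directly instead of being wrapped in `if not dry_run:` guards at every call site. A hedged sketch of that pattern, with a hypothetical `post` callable standing in for the real GitHub API call:

```python
import logging
from typing import Callable, Dict, List

log = logging.getLogger(__name__)


def add_labels(
    org: str,
    repo: str,
    pr_num: int,
    labels: List[str],
    dry_run: bool,
    post: Callable[[str, Dict[str, List[str]]], None],
) -> None:
    # Illustrative helper in the spirit of gh_add_labels: the helper decides
    # whether to hit the API, so callers can pass dry_run through unchanged.
    url = f"https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/labels"
    if dry_run:
        log.info("Dry run: would POST %s to %s", labels, url)
        return
    post(url, {"labels": labels})
```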
msg += f"The last exception was: {last_exception}" - if not dry_run: - gh_add_labels(pr.org, pr.project, pr.pr_num, ["land-failed"]) + gh_add_labels(pr.org, pr.project, pr.pr_num, ["land-failed"], dry_run) raise RuntimeError(msg) @@ -2281,6 +2331,16 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: ) return + if args.check_mergeability: + if pr.is_ghstack_pr(): + get_ghstack_prs(repo, pr) # raises error if out of sync + pr.merge_changes( + repo, + skip_mandatory_checks=True, + skip_all_rule_checks=True, + ) + return + if not args.force and pr.has_invalid_submodule_updates(): message = ( f"This PR updates submodules {', '.join(pr.get_changed_submodules())}\n" @@ -2329,7 +2389,10 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: else: print("Missing comment ID or PR number, couldn't upload to Rockset") finally: - gh_remove_label(org, project, args.pr_num, MERGE_IN_PROGRESS_LABEL) + if not args.check_mergeability: + gh_remove_label( + org, project, args.pr_num, MERGE_IN_PROGRESS_LABEL, args.dry_run + ) if __name__ == "__main__": diff --git a/.github/scripts/tryrebase.py b/.github/scripts/tryrebase.py index 641b354ef7dce..39a38aaf364c7 100755 --- a/.github/scripts/tryrebase.py +++ b/.github/scripts/tryrebase.py @@ -60,7 +60,7 @@ def rebase_onto( repo._run_git("rebase", onto_branch, branch) if repo.rev_parse(branch) == repo.rev_parse(onto_branch): - raise Exception(SAME_SHA_ERROR) + raise Exception(SAME_SHA_ERROR) # noqa: TRY002 if dry_run: push_result = repo._run_git("push", "--dry-run", "-f", remote_url, refspec) @@ -100,7 +100,7 @@ def rebase_ghstack_onto( repo._run_git("rebase", onto_branch, orig_ref) if repo.rev_parse(orig_ref) == repo.rev_parse(onto_branch): - raise Exception(SAME_SHA_ERROR) + raise Exception(SAME_SHA_ERROR) # noqa: TRY002 # steal the identity of the committer of the commit on the orig branch email = repo._run_git("log", orig_ref, "--pretty=format:%ae", "-1") @@ -126,7 +126,7 @@ def rebase_ghstack_onto( print(push_result) if ghstack_result.returncode != 0: print(ghstack_result.stderr.decode("utf-8")) - raise Exception(f"\n```{push_result}```") + raise Exception(f"\n```{push_result}```") # noqa: TRY002 # The contents of a successful push result should look like: # Summary of changes (ghstack 0.6.0) diff --git a/.github/scripts/update_commit_hashes.py b/.github/scripts/update_commit_hashes.py deleted file mode 100644 index 095e21d21e773..0000000000000 --- a/.github/scripts/update_commit_hashes.py +++ /dev/null @@ -1,171 +0,0 @@ -import json -import os -import subprocess -from argparse import ArgumentParser -from typing import Any, Dict - -import requests - -UPDATEBOT_TOKEN = os.environ["UPDATEBOT_TOKEN"] -PYTORCHBOT_TOKEN = os.environ["PYTORCHBOT_TOKEN"] -OWNER, REPO = "pytorch", "pytorch" - - -def git_api( - url: str, params: Dict[str, str], type: str = "get", token: str = UPDATEBOT_TOKEN -) -> Any: - headers = { - "Accept": "application/vnd.github.v3+json", - "Authorization": f"token {token}", - } - if type == "post": - return requests.post( - f"https://api.github.com{url}", - data=json.dumps(params), - headers=headers, - ).json() - elif type == "patch": - return requests.patch( - f"https://api.github.com{url}", - data=json.dumps(params), - headers=headers, - ).json() - else: - return requests.get( - f"https://api.github.com{url}", - params=params, - headers=headers, - ).json() - - -def parse_args() -> Any: - parser = ArgumentParser("Rebase PR into branch") - parser.add_argument("--repo-name", type=str) - 
parser.add_argument("--branch", type=str) - parser.add_argument("--pin-folder", type=str) - return parser.parse_args() - - -def make_pr(repo_name: str, branch_name: str) -> Any: - params = { - "title": f"[{repo_name} hash update] update the pinned {repo_name} hash", - "head": branch_name, - "base": "main", - "body": "This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/" - + f".github/workflows/_update-commit-hash.yml).\nUpdate the pinned {repo_name} hash.", - } - response = git_api(f"/repos/{OWNER}/{REPO}/pulls", params, type="post") - print(f"made pr {response['html_url']}") - return response["number"] - - -def approve_pr(pr_number: str) -> None: - params = {"event": "APPROVE"} - # use pytorchbot to approve the pr - git_api( - f"/repos/{OWNER}/{REPO}/pulls/{pr_number}/reviews", - params, - type="post", - token=PYTORCHBOT_TOKEN, - ) - - -def make_comment(pr_number: str, msg: str) -> None: - params = {"body": msg} - # comment with pytorchbot because pytorchmergebot gets ignored - git_api( - f"/repos/{OWNER}/{REPO}/issues/{pr_number}/comments", - params, - type="post", - token=PYTORCHBOT_TOKEN, - ) - - -def close_pr(pr_number: str) -> None: - params = {"state": "closed"} - git_api( - f"/repos/{OWNER}/{REPO}/pulls/{pr_number}", - params, - type="patch", - ) - - -def is_newer_hash(new_hash: str, old_hash: str, repo_name: str) -> bool: - def _get_date(hash: str) -> int: - # this git command prints the unix timestamp of the hash - return int( - subprocess.run( - f"git show --no-patch --no-notes --pretty=%ct {hash}".split(), - capture_output=True, - cwd=f"{repo_name}", - ) - .stdout.decode("utf-8") - .strip() - ) - - return _get_date(new_hash) > _get_date(old_hash) - - -def main() -> None: - args = parse_args() - - branch_name = os.environ["NEW_BRANCH_NAME"] - pr_num = None - - # query to see if a pr already exists - params = { - "q": f"is:pr is:open in:title author:pytorchupdatebot repo:{OWNER}/{REPO} {args.repo_name} hash update", - "sort": "created", - } - response = git_api("/search/issues", params) - if response["total_count"] != 0: - # pr does exist - pr_num = response["items"][0]["number"] - link = response["items"][0]["html_url"] - response = git_api(f"/repos/{OWNER}/{REPO}/pulls/{pr_num}", {}) - branch_name = response["head"]["ref"] - print( - f"pr does exist, number is {pr_num}, branch name is {branch_name}, link is {link}" - ) - - hash = ( - subprocess.run( - f"git rev-parse {args.branch}".split(), - capture_output=True, - cwd=f"{args.repo_name}", - ) - .stdout.decode("utf-8") - .strip() - ) - with open(f"{args.pin_folder}/{args.repo_name}.txt", "r+") as f: - old_hash = f.read().strip() - subprocess.run(f"git checkout {old_hash}".split(), cwd=args.repo_name) - f.seek(0) - f.truncate() - f.write(f"{hash}\n") - if is_newer_hash(hash, old_hash, args.repo_name): - # if there was an update, push to branch - subprocess.run(f"git checkout -b {branch_name}".split()) - subprocess.run(f"git add {args.pin_folder}/{args.repo_name}.txt".split()) - subprocess.run( - "git commit -m".split() + [f"update {args.repo_name} commit hash"] - ) - subprocess.run(f"git push --set-upstream origin {branch_name} -f".split()) - print(f"changes pushed to branch {branch_name}") - if pr_num is None: - # no existing pr, so make a new one and approve it - pr_num = make_pr(args.repo_name, branch_name) - approve_pr(pr_num) - make_comment(pr_num, "@pytorchbot merge") - else: - print( - f"tried to update from old hash: {old_hash} to new hash: {hash} but the old hash seems to be 
newer, not creating pr" - ) - if pr_num is not None: - make_comment(pr_num, "closing pr as the current hash seems up to date") - close_pr(pr_num) - print(f"closing PR {pr_num}") - - -if __name__ == "__main__": - main() diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index 762c473ce2ff9..d44915f41d85f 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -7,6 +7,7 @@ name: !{{ build_environment }} {%- endblock %} + on: push: {%- if branches == "nightly" %} @@ -45,7 +46,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 !{{ common.concurrency(build_environment) }} jobs: diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 505bde406d6a8..591dc52ef9c01 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -48,7 +48,7 @@ env: BUILD_ENVIRONMENT: !{{ build_environment }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 {%- if cross_compile_arm64 %} CROSS_COMPILE_ARM64: 1 {% endif %} diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index 62153da0cbf01..2d488d4f14dda 100644 --- a/.github/templates/upload.yml.j2 +++ b/.github/templates/upload.yml.j2 @@ -53,6 +53,9 @@ {%- macro upload_binaries(config, is_windows=False, has_test=True, use_s3=True) -%} !{{ config["build_name"] }}-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read {%- if has_test %} needs: !{{ config["build_name"] }}-test {%- else %} @@ -65,8 +68,6 @@ {%- endif %} secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/_android-build-test.yml b/.github/workflows/_android-build-test.yml index 9cb8bb287a9f1..d599e769b8b6a 100644 --- a/.github/workflows/_android-build-test.yml +++ b/.github/workflows/_android-build-test.yml @@ -131,7 +131,7 @@ jobs: export COMMAND # shellcheck disable=SC2016 - COMMAND='(echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh" | docker exec -u jenkins -e BUILD_LITE_INTERPRETER -e GRADLE_OFFLINE=1 -i "$id" bash) 2>&1' + COMMAND='(echo "sudo chown -R jenkins workspace && cd workspace && ./scripts/build_android_gradle.sh" | docker exec -u jenkins -e BUILD_LITE_INTERPRETER -e GRADLE_OFFLINE=1 -i "$id" bash) 2>&1' echo "${COMMAND}" > ./command.sh && bash ./command.sh # Skip docker push as this job is purely for size analysis purpose. # Result binaries are already in `/home/circleci/project/` as it's mounted instead of copied. 
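For reference, the freshness check that the removed `update_commit_hashes.py` performed in `is_newer_hash` reduces to comparing committer timestamps obtained from `git show --pretty=%ct`. A minimal sketch of that comparison, independent of the deleted script:

```python
import subprocess


def commit_timestamp(repo_dir: str, rev: str) -> int:
    # %ct prints the committer date as a unix timestamp, so a plain integer
    # comparison is enough to decide which commit is newer.
    out = subprocess.run(
        ["git", "show", "--no-patch", "--no-notes", "--pretty=%ct", rev],
        capture_output=True,
        check=True,
        cwd=repo_dir,
    )
    return int(out.stdout.decode("utf-8").strip())


def is_newer(repo_dir: str, new_rev: str, old_rev: str) -> bool:
    return commit_timestamp(repo_dir, new_rev) > commit_timestamp(repo_dir, old_rev)
```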
diff --git a/.github/workflows/_android-full-build-test.yml b/.github/workflows/_android-full-build-test.yml index 965667b7da7a1..7a0c4377eca4e 100644 --- a/.github/workflows/_android-full-build-test.yml +++ b/.github/workflows/_android-full-build-test.yml @@ -157,7 +157,7 @@ jobs: docker cp "${GITHUB_WORKSPACE}/build_android_install_x86_32" "${ID_X86_32}:/var/lib/jenkins/workspace/build_android_install_x86_32" # run gradle buildRelease - (echo "./.circleci/scripts/build_android_gradle.sh" | docker exec \ + (echo "./scripts/build_android_gradle.sh" | docker exec \ -e BUILD_ENVIRONMENT="pytorch-linux-focal-py3-clang9-android-ndk-r21e-gradle-build" \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e AWS_DEFAULT_REGION \ diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml index d7fbdc8b1ded8..ca65ce64bc657 100644 --- a/.github/workflows/_bazel-build-test.yml +++ b/.github/workflows/_bazel-build-test.yml @@ -86,9 +86,14 @@ jobs: with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Check if in a ARC runner + shell: bash + id: check_arc_runner + run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG uses: pytorch/test-infra/.github/actions/setup-nvidia@main - if: ${{ inputs.cuda-version != 'cpu' }} + if: ${{ inputs.cuda-version != 'cpu' && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }} - name: Output disk space left run: | diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index 3dee4ba92e7f0..34400149e53ff 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -78,7 +78,7 @@ on: jobs: build: runs-on: ${{ inputs.runs_on }} - timeout-minutes: 180 + timeout-minutes: 210 env: PYTORCH_ROOT: ${{ inputs.PYTORCH_ROOT }} BUILDER_ROOT: ${{ inputs.BUILDER_ROOT }} diff --git a/.github/workflows/_binary-upload.yml b/.github/workflows/_binary-upload.yml index 79014e30ef196..1231dd0e8c7d4 100644 --- a/.github/workflows/_binary-upload.yml +++ b/.github/workflows/_binary-upload.yml @@ -59,18 +59,13 @@ on: github-token: required: true description: Github Token - aws-pytorch-uploader-access-key-id: - required: true - description: AWS access key id - aws-pytorch-uploader-secret-access-key: - required: true - description: AWS secret access key conda-pytorchbot-token: required: true description: Conda PyTorchBot token conda-pytorchbot-token-test: required: true description: Conda PyTorchBot token + jobs: upload: runs-on: ubuntu-22.04 @@ -104,6 +99,20 @@ jobs: with: no-sudo: true + - name: Configure AWS credentials(PyTorch account) for nightly + if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/nightly' }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels + aws-region: us-east-1 + + - name: Configure AWS credentials(PyTorch account) for RC builds + if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels + aws-region: us-east-1 + - name: Download Build Artifacts id: download-artifacts # NB: When the previous build job is skipped, there won't be any artifacts and @@ -135,8 +144,6 @@ jobs: PKG_DIR: "${{ runner.temp 
}}/artifacts" UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.aws-pytorch-uploader-access-key-id }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.aws-pytorch-uploader-secret-access-key }} CONDA_PYTORCHBOT_TOKEN: ${{ secrets.conda-pytorchbot-token }} CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.conda-pytorchbot-token-test }} BUILD_NAME: ${{ inputs.build_name }} diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index 6b354fe92606a..069bcb4d2a14e 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -28,7 +28,21 @@ on: description: | If this is set, our linter will use this to make sure that every other job with the same `sync-tag` is identical. - + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" + upload-aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" secrets: GH_PYTORCHBOT_TOKEN: required: false @@ -82,6 +96,14 @@ jobs: - name: Setup Linux uses: ./.github/actions/setup-linux + - name: configure aws credentials + if : ${{ inputs.aws-role-to-assume != '' }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: ${{ inputs.aws-role-to-assume }} + role-session-name: gha-linux-test + aws-region: us-east-1 + - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main @@ -97,6 +119,7 @@ jobs: uses: ./.github/actions/download-build-artifacts with: name: ${{ inputs.build-environment }} + s3-bucket: ${{ inputs.s3-bucket }} - name: Generate netrc (only for docs-push) if: inputs.push @@ -156,6 +179,14 @@ jobs: uses: ./.github/actions/chown-workspace if: always() + - name: configure aws credentials + if : ${{ inputs.upload-aws-role-to-assume != '' }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: ${{ inputs.upload-aws-role-to-assume }} + role-session-name: gha-linux-test + aws-region: us-east-1 + - name: Upload Python Docs Preview uses: seemethere/upload-artifact-s3@v5 if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' && steps.build-docs.outcome == 'success' }} @@ -163,7 +194,7 @@ jobs: retention-days: 14 s3-bucket: doc-previews if-no-files-found: error - path: pytorch.github.io/docs/main/ + path: pytorch_docs/main/ s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }} - name: Upload C++ Docs Preview diff --git a/.github/workflows/_linux-build-label.yml b/.github/workflows/_linux-build-label.yml new file mode 100644 index 0000000000000..427f993b48530 --- /dev/null +++ b/.github/workflows/_linux-build-label.yml @@ -0,0 +1,109 @@ +name: linux-build + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image-name: + required: true + type: string + description: Name of the base docker image to build with. + build-generates-artifacts: + required: false + type: boolean + default: true + description: If set, upload generated build artifacts. + build-with-debug: + required: false + type: boolean + default: false + description: If set, build in debug mode. 
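The upload workflows above drop the long-lived `AWS_PYTORCH_UPLOADER_*` secrets in favour of OIDC role assumption and pick the role from the pushed ref: `refs/heads/nightly` maps to the nightly role, while version tags (excluding `refs/tags/ciflow/*`) map to the release-candidate role. A small sketch mirroring those workflow conditions (the tag in the asserts is only an example value):

```python
from typing import Optional


def upload_channel(event_name: str, ref: str) -> Optional[str]:
    # Mirrors the `if:` conditions on the configure-aws-credentials steps.
    if event_name != "push":
        return None
    if ref == "refs/heads/nightly":
        return "nightly"
    if ref.startswith("refs/tags/") and not ref.startswith("refs/tags/ciflow/"):
        return "release-candidate"
    return None


assert upload_channel("push", "refs/heads/nightly") == "nightly"
assert upload_channel("push", "refs/tags/v2.4.0-rc1") == "release-candidate"
assert upload_channel("push", "refs/tags/ciflow/trunk/12345") is None
```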
+ sync-tag: + required: false + type: string + default: "" + description: | + If this is set, our linter will use this to make sure that every other + job with the same `sync-tag` is identical. + cuda-arch-list: + required: false + type: string + default: "5.2" + description: Runner label to select worker type + runner: + required: false + type: string + default: "linux.2xlarge" + description: | + List of CUDA architectures CI build should target. + test-matrix: + required: false + type: string + description: | + An option JSON description of what test configs to run later on. This + is moved here from the Linux test workflow so that we can apply filter + logic using test-config labels earlier and skip unnecessary builds + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub + + outputs: + docker-image: + value: ${{ jobs.build.outputs.docker-image }} + description: The docker image containing the built PyTorch. + test-matrix: + value: ${{ jobs.build.outputs.test-matrix }} + description: An optional JSON description of what test configs to run later on. + +jobs: + build: + # Don't run on forked repos + if: github.repository_owner == 'pytorch' + runs-on: ${{ inputs.runner }} + timeout-minutes: 240 + outputs: + docker-image: ${{ steps.linux-build.outputs.docker-image }} + test-matrix: ${{ steps.linux-build.outputs.test-matrix }} + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + # [pytorch repo ref] + # Use a pytorch/pytorch reference instead of a reference to the local + # checkout because when we run this action we don't *have* a local + # checkout. In other cases you should prefer a local checkout. + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + + - name: Linux Build + id: linux-build + uses: ./.github/actions/linux-build + with: + build-environment: ${{ inputs.build-environment }} + docker-image-name: ${{ inputs.docker-image-name }} + build-generates-artifacts: ${{ inputs.build-generates-artifacts }} + build-with-debug: ${{ inputs.build-with-debug }} + sync-tag: ${{ inputs.sync-tag }} + cuda-arch-list: ${{ inputs.cuda-arch-list }} + test-matrix: ${{ inputs.test-matrix }} + s3-bucket: ${{ inputs.s3-bucket }} + aws-role-to-assume: ${{ inputs.aws-role-to-assume }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} diff --git a/.github/workflows/_linux-build-rg.yml b/.github/workflows/_linux-build-rg.yml new file mode 100644 index 0000000000000..6c6a4827e1672 --- /dev/null +++ b/.github/workflows/_linux-build-rg.yml @@ -0,0 +1,105 @@ +name: linux-build-rg + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image-name: + required: true + type: string + description: Name of the base docker image to build with. + build-generates-artifacts: + required: false + type: boolean + default: true + description: If set, upload generated build artifacts. 
+ build-with-debug: + required: false + type: boolean + default: false + description: If set, build in debug mode. + sync-tag: + required: false + type: string + default: "" + description: | + If this is set, our linter will use this to make sure that every other + job with the same `sync-tag` is identical. + cuda-arch-list: + required: false + type: string + default: "5.2" + description: | + List of CUDA architectures CI build should target. + runner-group: + required: false + type: string + default: "arc-lf-linux.2xlarge" + description: Runner group to select group type + test-matrix: + required: false + type: string + description: | + An option JSON description of what test configs to run later on. This + is moved here from the Linux test workflow so that we can apply filter + logic using test-config labels earlier and skip unnecessary builds + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub + + outputs: + docker-image: + value: ${{ jobs.build.outputs.docker-image }} + description: The docker image containing the built PyTorch. + test-matrix: + value: ${{ jobs.build.outputs.test-matrix }} + description: An optional JSON description of what test configs to run later on. + +jobs: + build: + # Don't run on forked repos + if: github.repository_owner == 'pytorch' + runs-on: + group: ${{ inputs.runner-group }} + timeout-minutes: 240 + outputs: + docker-image: ${{ steps.linux-build.outputs.docker-image }} + test-matrix: ${{ steps.linux-build.outputs.test-matrix }} + steps: + # [pytorch repo ref] + # Use a pytorch/pytorch reference instead of a reference to the local + # checkout because when we run this action we don't *have* a local + # checkout. In other cases you should prefer a local checkout. + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + + - name: Linux Build + id: linux-build + uses: ./.github/actions/linux-build + with: + build-environment: ${{ inputs.build-environment }} + docker-image-name: ${{ inputs.docker-image-name }} + build-generates-artifacts: ${{ inputs.build-generates-artifacts }} + build-with-debug: ${{ inputs.build-with-debug }} + sync-tag: ${{ inputs.sync-tag }} + cuda-arch-list: ${{ inputs.cuda-arch-list }} + test-matrix: ${{ inputs.test-matrix }} + s3-bucket: ${{ inputs.s3-bucket }} + aws-role-to-assume: ${{ inputs.aws-role-to-assume }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index 9a88ed70b7f2a..c3bcb0d888dfc 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -47,6 +47,23 @@ on: An option JSON description of what test configs to run later on. 
This is moved here from the Linux test workflow so that we can apply filter logic using test-config labels earlier and skip unnecessary builds + selected-test-configs: + description: | + A comma-separated list of test configurations from the test matrix to keep, + The empty list means we are going to keep every configurations by defaults + required: false + type: string + default: "" + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: Role to assume for downloading artifacts + required: false + type: string + default: "" secrets: HUGGING_FACE_HUB_TOKEN: required: false @@ -87,6 +104,14 @@ jobs: - name: Setup Linux uses: ./.github/actions/setup-linux + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v3 + if: ${{ inputs.aws-role-to-assume != '' }} + with: + role-to-assume: ${{ inputs.aws-role-to-assume }} + role-session-name: gha-linux-build + aws-region: us-east-1 + - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main @@ -125,6 +150,7 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} test-matrix: ${{ inputs.test-matrix }} + selected-test-configs: ${{ inputs.selected-test-configs }} job-name: ${{ steps.get-job-id.outputs.job-name }} - name: Download pytest cache @@ -133,6 +159,7 @@ jobs: with: cache_dir: .pytest_cache job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} + s3_bucket: ${{ inputs.s3-bucket }} - name: Build if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '' @@ -197,6 +224,7 @@ jobs: retention-days: 14 if-no-files-found: error path: artifacts.zip + s3-bucket: ${{ inputs.s3-bucket }} - name: Upload sccache stats if: steps.build.outcome != 'skipped' @@ -207,6 +235,7 @@ jobs: retention-days: 365 if-no-files-found: warn path: sccache-stats-*.json + s3-bucket: ${{ inputs.s3-bucket }} - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main diff --git a/.github/workflows/_linux-test-label.yml b/.github/workflows/_linux-test-label.yml new file mode 100644 index 0000000000000..7056c0168a19e --- /dev/null +++ b/.github/workflows/_linux-test-label.yml @@ -0,0 +1,85 @@ +name: linux-test-rg + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + docker-image: + required: true + type: string + description: Docker image to run in. + sync-tag: + required: false + type: string + default: "" + description: | + If this is set, our linter will use this to make sure that every other + job with the same `sync-tag` is identical. + timeout-minutes: + required: false + type: number + default: 240 + description: | + Set the maximum (in minutes) how long the workflow should take to finish + use-gha: + required: false + type: string + default: "" + description: If set to any value, upload to GHA. Otherwise upload to S3. 
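The new `selected-test-configs` input to `_linux-build.yml` is a comma-separated allow-list handed to the filter step; the actual filtering happens in `.github/scripts/filter_test_configs.py`, which is not part of this diff. A rough sketch of the intended behaviour, with the empty-selection-keeps-everything default taken from the input description:

```python
import json


def filter_selected_configs(test_matrix: str, selected: str) -> str:
    # Keep only the configs named in the comma-separated `selected` string;
    # an empty selection leaves the matrix untouched (assumed default).
    wanted = {c.strip().lower() for c in selected.split(",") if c.strip()}
    matrix = json.loads(test_matrix)
    if wanted:
        matrix["include"] = [
            entry
            for entry in matrix["include"]
            if entry.get("config", "").lower() in wanted
        ]
    return json.dumps(matrix)


print(filter_selected_configs(
    '{"include": [{"config": "default"}, {"config": "distributed"}]}',
    "distributed",
))
```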
+ dashboard-tag: + required: false + type: string + default: "" + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub + +env: + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + test: + # Don't run on forked repos or empty test matrix + if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' + strategy: + matrix: ${{ fromJSON(inputs.test-matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + + - name: Linux Test + id: linux-test + uses: ./.github/actions/linux-test + with: + build-environment: ${{ inputs.build-environment }} + test-matrix: ${{ inputs.test-matrix }} + docker-image: ${{ inputs.docker-image }} + sync-tag: ${{ inputs.sync-tag }} + use-gha: ${{ inputs.use-gha }} + dashboard-tag: ${{ inputs.dashboard-tag }} + s3-bucket: ${{ inputs.s3-bucket }} + aws-role-to-assume: ${{ inputs.aws-role-to-assume }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/_linux-test-rg.yml b/.github/workflows/_linux-test-rg.yml new file mode 100644 index 0000000000000..6dc2f6c63bf3e --- /dev/null +++ b/.github/workflows/_linux-test-rg.yml @@ -0,0 +1,86 @@ +name: linux-test-label + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + docker-image: + required: true + type: string + description: Docker image to run in. + sync-tag: + required: false + type: string + default: "" + description: | + If this is set, our linter will use this to make sure that every other + job with the same `sync-tag` is identical. + timeout-minutes: + required: false + type: number + default: 240 + description: | + Set the maximum (in minutes) how long the workflow should take to finish + use-gha: + required: false + type: string + default: "" + description: If set to any value, upload to GHA. Otherwise upload to S3. 
+ dashboard-tag: + required: false + type: string + default: "" + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub + +env: + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + test: + # Don't run on forked repos or empty test matrix + if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' + strategy: + matrix: ${{ fromJSON(inputs.test-matrix) }} + fail-fast: false + runs-on: + group: ${{ matrix.runner }} + timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + + - name: Linux Test + id: linux-test + uses: ./.github/actions/linux-test + with: + build-environment: ${{ inputs.build-environment }} + test-matrix: ${{ inputs.test-matrix }} + docker-image: ${{ inputs.docker-image }} + sync-tag: ${{ inputs.sync-tag }} + use-gha: ${{ inputs.use-gha }} + dashboard-tag: ${{ inputs.dashboard-tag }} + s3-bucket: ${{ inputs.s3-bucket }} + aws-role-to-assume: ${{ inputs.aws-role-to-assume }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 1d14950549a8e..5f3f290dd31da 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -37,6 +37,16 @@ on: required: false type: string default: "" + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" secrets: HUGGING_FACE_HUB_TOKEN: required: false @@ -71,6 +81,14 @@ jobs: - name: Setup Linux uses: ./.github/actions/setup-linux + - name: configure aws credentials + if : ${{ inputs.aws-role-to-assume != '' }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: ${{ inputs.aws-role-to-assume }} + role-session-name: gha-linux-test + aws-region: us-east-1 + - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main @@ -91,10 +109,15 @@ jobs: with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Check if in a ARC runner + shell: bash + id: check_arc_runner + run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver uses: pytorch/test-infra/.github/actions/setup-nvidia@main - if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') + if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }} - name: Lock NVIDIA A100 40GB Frequency run: | @@ -116,6 +139,11 @@ jobs: uses: ./.github/actions/download-build-artifacts with: name: ${{ inputs.build-environment }} + s3-bucket: ${{ inputs.s3-bucket }} + + - name: Download TD artifacts + continue-on-error: true + uses: ./.github/actions/download-td-artifacts - 
name: Parse ref id: parse-ref @@ -169,6 +197,10 @@ jobs: NUM_TEST_SHARDS: ${{ matrix.num_shards }} REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} + VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} + NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} + TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }} SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} @@ -218,6 +250,10 @@ jobs: -e NUM_TEST_SHARDS \ -e REENABLED_ISSUES \ -e CONTINUE_THROUGH_ERROR \ + -e VERBOSE_TEST_LOGS \ + -e NO_TEST_TIMEOUT \ + -e NO_TD \ + -e TD_DISTRIBUTED \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ @@ -230,7 +266,6 @@ jobs: -e HUGGING_FACE_HUB_TOKEN \ -e DASHBOARD_TAG \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ --ipc=host \ @@ -280,6 +315,7 @@ jobs: with: file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} use-gha: ${{ inputs.use-gha }} + s3-bucket: ${{ inputs.s3-bucket }} - name: Collect backtraces from coredumps (if any) if: always() diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml index b10c1f84bd7ff..2c0da2f8afd7c 100644 --- a/.github/workflows/_mac-test-mps.yml +++ b/.github/workflows/_mac-test-mps.yml @@ -34,12 +34,14 @@ jobs: test-matrix: ${{ steps.filter.outputs.test-matrix }} is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }} keep-going: ${{ steps.filter.outputs.keep-going }} + ci-verbose-test-logs: ${{ steps.filter.outputs.ci-verbose-test-logs }} + ci-no-test-timeout: ${{ steps.filter.outputs.ci-no-test-timeout }} + ci-no-td: ${{ steps.filter.outputs.ci-no-td }} reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }} steps: - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main with: - fetch-depth: 1 submodules: false - name: Select all requested test configurations @@ -95,6 +97,9 @@ jobs: PY_VERS: 3.9 PR_BODY: ${{ github.event.pull_request.body }} CONTINUE_THROUGH_ERROR: ${{ needs.filter.outputs.keep-going }} + VERBOSE_TEST_LOGS: ${{ needs.filter.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ needs.filter.outputs.ci-no-test-timeout }} + NO_TD: ${{ needs.filter.outputs.ci-no-td }} PIP_REQUIREMENTS_FILE: .github/requirements/pip-requirements-${{ runner.os }}.txt REENABLED_ISSUES: ${{ needs.filter.outputs.reenabled-issues }} run: | diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 4848a566f15ed..b8e90771ec73b 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -91,6 +91,12 @@ jobs: name: ${{ inputs.build-environment }} use-gha: true + - name: Download TD artifacts + continue-on-error: true + uses: ./.github/actions/download-td-artifacts + with: + use-gha: true + - name: Setup miniconda uses: pytorch/test-infra/.github/actions/setup-miniconda@main with: @@ -148,6 +154,9 @@ jobs: PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} + 
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} + NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} PIP_REQUIREMENTS_FILE: .github/requirements/pip-requirements-${{ runner.os }}.txt GITHUB_REPOSITORY: ${{ github.repository }} GITHUB_WORKFLOW: ${{ github.workflow }} diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 649cae5a2c20b..1f2d86273ee14 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -42,6 +42,10 @@ on: env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} +permissions: + id-token: write + contents: read + jobs: test: # Don't run on forked repos or empty test matrix @@ -61,6 +65,19 @@ jobs: - name: Setup ROCm uses: ./.github/actions/setup-rocm + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Login to Amazon ECR + id: login-ecr + continue-on-error: true + uses: aws-actions/amazon-ecr-login@v2 + - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main @@ -86,6 +103,10 @@ jobs: with: name: ${{ inputs.build-environment }} + - name: Download TD artifacts + continue-on-error: true + uses: ./.github/actions/download-td-artifacts + - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -132,6 +153,9 @@ jobs: BRANCH: ${{ steps.parse-ref.outputs.branch }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} + VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} + NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} @@ -180,6 +204,9 @@ jobs: -e NUM_TEST_SHARDS \ -e REENABLED_ISSUES \ -e CONTINUE_THROUGH_ERROR \ + -e VERBOSE_TEST_LOGS \ + -e NO_TEST_TIMEOUT \ + -e NO_TD \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index 6d041a19c4dc7..bc381c50628d1 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -128,6 +128,7 @@ jobs: PYTHON_VERSION: "3.8" SCCACHE_BUCKET: "ossci-compiler-cache" SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} + SCCACHE_REGION: us-east-1 VC_PRODUCT: "BuildTools" VC_VERSION: "" VC_YEAR: "2019" diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index ebfe4211b34c7..99d037f0355ce 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -25,7 +25,7 @@ on: timeout-minutes: required: false type: number - default: 300 + default: 240 description: | Set the maximum (in minutes) how long the workflow should take to finish @@ -92,7 +92,7 @@ jobs: retry_wait_seconds: 30 command: | set -eu - python3 -m pip install rockset==1.0.3 + python3 -m pip install rockset==1.0.3 'xdoctest>=1.1.0' - name: Start monitoring script id: monitor-script @@ -114,6 +114,10 @@ jobs: run: | tree /F C:\$Env:GITHUB_RUN_ID\build-results + - name: Download TD artifacts + continue-on-error: true + uses: ./.github/actions/download-td-artifacts + - name: Get 
workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -132,14 +136,26 @@ jobs: test-matrix: ${{ inputs.test-matrix }} job-name: ${{ steps.get-job-id.outputs.job-name }} + - name: Set Test step time + id: test-timeout + shell: bash + env: + JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} + run: | + echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" + - name: Test id: test shell: bash + timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} env: USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} INSTALL_WINDOWS_SDK: 1 PYTHON_VERSION: 3.8 CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} + VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} + NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} VC_PRODUCT: "BuildTools" VC_VERSION: "" VS_VERSION: "16.8.6" diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index 1122454b46fc7..d7af711f8adb4 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -42,10 +42,6 @@ on: env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} -permissions: - id-token: write - contents: read - jobs: test: # Don't run on forked repos or empty test matrix @@ -67,7 +63,7 @@ jobs: id: aws_creds uses: aws-actions/configure-aws-credentials@v1.7.0 with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_pytorch_artifacts + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 - name: Login to Amazon ECR @@ -147,6 +143,9 @@ jobs: PYTORCH_RETRY_TEST_CASES: 1 PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} + VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} + NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} @@ -189,6 +188,9 @@ jobs: -e PYTORCH_RETRY_TEST_CASES \ -e PYTORCH_OVERRIDE_FLAKY_SIGNAL \ -e CONTINUE_THROUGH_ERROR \ + -e VERBOSE_TEST_LOGS \ + -e NO_TEST_TIMEOUT \ + -e NO_TD \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ diff --git a/.github/workflows/auto_request_review.yml b/.github/workflows/auto_request_review.yml index 7c98c2990fba7..25eb72bc2faab 100644 --- a/.github/workflows/auto_request_review.yml +++ b/.github/workflows/auto_request_review.yml @@ -3,11 +3,13 @@ name: Auto Request Review on: pull_request: types: [opened, ready_for_review, reopened] - jobs: auto-request-review: # Don't run on forked repos if: ${{ !github.event.pull_request.head.repo.fork }} + permissions: + contents: read + pull-requests: write name: Auto Request Review runs-on: ubuntu-latest steps: diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index ef73a386ef590..ddba8ff8907cc 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -37,7 +37,7 @@ jobs: device: ["cuda", "rocm"] include: - device: "rocm" - rocm_version: "5.7" + rocm_version: "6.1" - device: "cuda" rocm_version: "" timeout-minutes: 40 @@ -119,8 +119,7 @@ jobs: - uses: actions/upload-artifact@v3 with: - # NB: Use the same name here and all wheels can be downloaded by referring to the same artifact - name: 
pytorch-triton-wheel + name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }} if-no-files-found: error path: ${{ runner.temp }}/artifacts/* @@ -131,17 +130,41 @@ jobs: upload-wheel: runs-on: ubuntu-22.04 needs: build-wheel + permissions: + id-token: write + contents: read container: image: continuumio/miniconda3:4.12.0 environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }} steps: - uses: actions/checkout@v3 + - name: Configure AWS credentials(PyTorch account) for main + if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels + aws-region: us-east-1 + + - name: Configure AWS credentials(PyTorch account) for RC builds + if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels + aws-region: us-east-1 + - name: Download Build Artifacts uses: actions/download-artifact@v3 with: - name: pytorch-triton-wheel - path: ${{ runner.temp }}/artifacts/ + # Download all available artifacts + path: ${{ runner.temp }}/artifacts-all + + - name: Select Wheel Artifacts + shell: bash + run: | + set -x + mkdir -p "${RUNNER_TEMP}/artifacts/" + mv "${RUNNER_TEMP}"/artifacts-all/pytorch-triton-wheel-*/* "${RUNNER_TEMP}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }} @@ -168,9 +191,6 @@ jobs: # to nightly or test UPLOAD_SUBFOLDER: "" PKG_DIR: ${{ runner.temp }}/artifacts - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} shell: bash run: | set -ex @@ -232,8 +252,7 @@ jobs: - uses: actions/upload-artifact@v3 with: - # NB: Use the same name here and all wheels can be downloaded by referring to the same artifact - name: pytorch-triton-conda + name: pytorch-triton-conda-${{ matrix.py_vers }} if-no-files-found: error path: ${{ runner.temp }}/artifacts/* @@ -253,8 +272,15 @@ jobs: - name: Download Build Artifacts uses: actions/download-artifact@v3 with: - name: pytorch-triton-conda - path: ${{ runner.temp }}/artifacts/ + # Download all available artifacts + path: ${{ runner.temp }}/artifacts-all + + - name: Select Conda Artifacts + shell: bash + run: | + set -x + mkdir -p "${RUNNER_TEMP}/artifacts/" + mv "${RUNNER_TEMP}"/artifacts-all/pytorch-triton-conda-*/* "${RUNNER_TEMP}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }} diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml index d44f91936c342..d638d588504f2 100644 --- a/.github/workflows/check-labels.yml +++ b/.github/workflows/check-labels.yml @@ -9,13 +9,14 @@ on: pull_request_target: types: [opened, synchronize, reopened, labeled, unlabeled] branches: [main] - paths-ignore: [.github] - # To allow testing PRs that change workflows. - # May be triggered together with pull_request_target, it's OK. 
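Because each matrix entry of `build-triton-wheel.yml` now uploads under its own artifact name (`pytorch-triton-wheel-<py>-<device>`, `pytorch-triton-conda-<py>`), the upload jobs download every artifact and then flatten the per-matrix directories into a single folder before publishing. A rough Python equivalent of the shell `mv` consolidation step (paths are illustrative):

```python
import shutil
from pathlib import Path


def consolidate_artifacts(artifacts_all: Path, dest: Path, prefix: str) -> None:
    # Each matrix entry uploaded into artifacts-all/<prefix><suffix>/;
    # gather everything into a single flat directory.
    dest.mkdir(parents=True, exist_ok=True)
    for per_matrix_dir in artifacts_all.glob(f"{prefix}*"):
        for artifact in per_matrix_dir.iterdir():
            shutil.move(str(artifact), str(dest / artifact.name))


consolidate_artifacts(Path("artifacts-all"), Path("artifacts"), "pytorch-triton-wheel-")
```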
+ # To check labels on ghstack PRs. + # Note: as pull_request doesn't trigger on PRs targeting main, + # to test changes to the workflow itself one needs to create + # a PR that targets a gh/**/base branch. pull_request: types: [opened, synchronize, reopened, labeled, unlabeled] - paths: [.github] + branches: [gh/**/base] workflow_dispatch: @@ -26,6 +27,7 @@ concurrency: jobs: check-labels: name: Check labels + if: github.repository_owner == 'pytorch' runs-on: linux.20_04.4x steps: - name: Checkout PyTorch diff --git a/.github/workflows/check_mergeability_ghstack.yml b/.github/workflows/check_mergeability_ghstack.yml index 41994c7ebbf77..562687564054f 100644 --- a/.github/workflows/check_mergeability_ghstack.yml +++ b/.github/workflows/check_mergeability_ghstack.yml @@ -1,29 +1,84 @@ -name: Check mergeability and dependencies for ghstack prs +name: Check mergeability of ghstack PR on: pull_request: - types: [opened, synchronize, reopened, edited] + types: [opened, synchronize, reopened] + branches: [gh/**/base] jobs: - check-regex: + ghstack-mergeability-check: runs-on: ubuntu-latest - outputs: - regex-match: ${{ steps.regex-match.outputs.match }} steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup git + shell: bash + run: | + git config --global user.email "pytorchmergebot@users.noreply.github.com" + git config --global user.name "PyTorch MergeBot" + git fetch origin main:main + + - name: Wait for orig branch + shell: bash + run: | + BRANCH="${{ github.base_ref }}" + echo "$BRANCH" + BRANCH="${BRANCH%/base}/orig" + echo "$BRANCH" + + WAIT_SECONDS=300 + END_WAIT=$((SECONDS+WAIT_SECONDS)) + BRANCH_EXISTS=0 + + while [ $SECONDS -lt $END_WAIT ]; do + git fetch --prune origin "${BRANCH}" || true + if git rev-parse --verify "origin/${BRANCH}"; then + BRANCH_EXISTS=1 + break + fi + echo "Waiting for branch ${BRANCH} to exist..." + sleep 30 # Wait for 30 seconds before retrying + done - - id: regex-match - uses: actions-ecosystem/action-regex-match@d50fd2e7a37d0e617aea3d7ada663bd56862b9cc + if [ $BRANCH_EXISTS -eq 0 ]; then + echo "Branch ${BRANCH} not found after ${WAIT_SECONDS} seconds." + echo "Mergeability check failed for infrastructure reasons." 
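+          # Note: a new push to the PR re-runs this workflow via the 'synchronize' trigger above, so this check is retried once the orig branch has been pushed (editorial comment; the retry behaviour is an assumption based on the trigger list, not stated in the script itself).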
+ exit 1 + fi + + - name: Setup Python + uses: actions/setup-python@v4 with: - text: ${{ github.head_ref }} - regex: '^(gh/[^/]+/[0-9]+/)head$' - - pr-dependencies-check: - needs: check-regex - if: ${{ needs.check-regex.outputs.regex-match != '' }} - uses: pytorch/test-infra/.github/workflows/pr-dependencies-check.yml@main - with: - pr_number: ${{ github.event.pull_request.number }} + python-version: '3.8' + cache: pip + architecture: x64 + + - run: pip install pyyaml==6.0 rockset==1.0.3 + shell: bash + + - name: Verify mergeability + shell: bash + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUM: ${{ github.event.pull_request.number }} + run: | + set -ex + python3 .github/scripts/trymerge.py --check-mergeability "${PR_NUM}" + + - name: Print debug info + if: failure() + shell: bash + env: + PR_NUM: ${{ github.event.pull_request.number }} + run: | + { + echo "# PR $PR_NUM is not mergeable into main" + echo "To debug, run the diagnostic workflow:" + echo "https://github.com/pytorch/test-infra/actions/workflows/pr-dependencies-check.yml" + } >> "$GITHUB_STEP_SUMMARY" + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} diff --git a/.github/workflows/cherry-pick.yml b/.github/workflows/cherry-pick.yml new file mode 100644 index 0000000000000..059ad781d748d --- /dev/null +++ b/.github/workflows/cherry-pick.yml @@ -0,0 +1,57 @@ +name: Create a cherry pick from a PR + +on: + repository_dispatch: + types: [try-cherry-pick] + +jobs: + cherry-pick: + name: cherry-pick-pr-${{ github.event.client_payload.pr_num }} + runs-on: ubuntu-latest + environment: cherry-pick-bot + env: + GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + steps: + - name: Checkout repo + id: checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + token: ${{ secrets.GH_PYTORCHBOT_CHERRY_PICK_TOKEN }} + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: pip + + # Not the direct dependencies but the script uses trymerge + - run: pip install pyyaml==6.0 rockset==1.0.3 + + - name: Setup committer id + run: | + git config --global user.name "PyTorch Bot" + git config --global user.email "pytorchbot@users.noreply.github.com" + + - name: Cherry pick the PR + shell: bash + env: + PR_NUM: ${{ github.event.client_payload.pr_num }} + BRANCH: ${{ github.event.client_payload.branch }} + CLASSIFICATION: ${{ github.event.client_payload.classification }} + FIXES: ${{ github.event.client_payload.fixes || '' }} + ACTOR: ${{ github.actor }} + GITHUB_TOKEN: ${{ secrets.GH_PYTORCHBOT_CHERRY_PICK_TOKEN }} + run: | + set -ex + + python .github/scripts/cherry_pick.py \ + --onto-branch "${BRANCH}" \ + --classification "${CLASSIFICATION}" \ + --fixes "${FIXES}" \ + --github-actor "${ACTOR}" \ + "${PR_NUM}" + +concurrency: + group: cherry-pick-pr-${{ github.event.client_payload.pr_num }} + cancel-in-progress: true diff --git a/.github/workflows/close-nonexistent-disable-issues.yml b/.github/workflows/close-nonexistent-disable-issues.yml index 26c74286114a6..f384295b84b8a 100644 --- a/.github/workflows/close-nonexistent-disable-issues.yml +++ b/.github/workflows/close-nonexistent-disable-issues.yml @@ -6,6 +6,7 @@ on: jobs: close-nonexistent-disable-issues: + environment: rockset-read-only if: github.repository_owner == 'pytorch' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index 
ef263c5a3d656..c80b61c22c5e7 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -15,6 +15,9 @@ jobs: if: ${{ github.repository == 'pytorch/pytorch' }} name: Create Release runs-on: ubuntu-latest + # https://github.com/softprops/action-gh-release?tab=readme-ov-file#permissions + permissions: + contents: write steps: - uses: malfet/checkout@silent-checkout with: diff --git a/.github/workflows/delete_old_branches.yml b/.github/workflows/delete_old_branches.yml new file mode 100644 index 0000000000000..04a0521419a8e --- /dev/null +++ b/.github/workflows/delete_old_branches.yml @@ -0,0 +1,39 @@ +# A workflow that deletes branches of closed PRs + +name: Delete old branches + +on: + schedule: + # Run daily. + - cron: 30 1 * * * + workflow_dispatch: + +concurrency: + group: delete-old-branches + cancel-in-progress: true + +permissions: + contents: write + +jobs: + delete: + if: ${{ github.repository == 'pytorch/pytorch' }} + runs-on: ubuntu-latest + + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + architecture: x64 + check-latest: false + + - name: Delete old branches + run: python .github/scripts/delete_old_branches.py + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index c006b0cfac27d..6d822165895eb 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -27,32 +27,40 @@ env: ALPINE_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine AWS_DEFAULT_REGION: us-east-1 +permissions: read-all + jobs: docker-build: - runs-on: [self-hosted, linux.2xlarge] environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} timeout-minutes: 240 strategy: fail-fast: false matrix: + runner: [linux.12xlarge] + docker-image-name: [ + pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9, + pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks, + pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9, + pytorch-linux-focal-py3.8-clang10, + pytorch-linux-focal-py3.11-clang10, + pytorch-linux-focal-py3.12-clang10, + pytorch-linux-focal-rocm-n-1-py3, + pytorch-linux-focal-rocm-n-py3, + pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12, + pytorch-linux-focal-py3-clang9-android-ndk-r21e, + pytorch-linux-jammy-py3.8-gcc11, + pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks, + pytorch-linux-jammy-xpu-2024.0-py3, + pytorch-linux-jammy-py3-clang15-asan, + pytorch-linux-focal-py3-clang10-onnx, + pytorch-linux-focal-linter, + pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter, + pytorch-linux-jammy-py3-clang12-executorch + ] include: - - docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9 - - docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks - - docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9 - - docker-image-name: pytorch-linux-focal-py3.8-clang10 - - docker-image-name: pytorch-linux-focal-py3.11-clang10 - - docker-image-name: pytorch-linux-focal-rocm-n-1-py3 - - docker-image-name: pytorch-linux-focal-rocm-n-py3 - - docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12 - - docker-image-name: pytorch-linux-focal-py3-clang9-android-ndk-r21e - - docker-image-name: pytorch-linux-jammy-py3.8-gcc11 - - docker-image-name: pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks - - docker-image-name: 
pytorch-linux-jammy-xpu-2024.0-py3 - - docker-image-name: pytorch-linux-jammy-py3-clang15-asan - - docker-image-name: pytorch-linux-focal-py3-clang10-onnx - - docker-image-name: pytorch-linux-focal-linter - - docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter - - docker-image-name: pytorch-linux-jammy-py3-clang12-executorch + - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 + runner: linux.arm64.2xlarge + runs-on: [self-hosted, "${{ matrix.runner }}"] env: DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }} steps: @@ -107,6 +115,8 @@ jobs: - name: Chown workspace uses: ./.github/actions/chown-workspace + with: + ALPINE_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/${{ (matrix.runner == 'linux.arm64.2xlarge') && 'arm64v8' || 'tool' }}/alpine if: always() - name: Teardown Linux diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index bdc4fa3b0c010..4ece88d5e47da 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -7,10 +7,13 @@ on: - Dockerfile - docker.Makefile - .github/workflows/docker-release.yml + - .github/scripts/generate_docker_release_matrix.py push: branches: - nightly tags: + # Final Release tags look like: v1.11.0 + - v[0-9]+.[0-9]+.[0-9]+ # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - ciflow/nightly/* @@ -28,6 +31,8 @@ env: USE_BUILDX: 1 WITH_PUSH: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v')) }} +permissions: read-all + jobs: generate-matrix: if: github.repository_owner == 'pytorch' @@ -99,6 +104,16 @@ jobs: echo "${RUNNER_TEMP}/bin" >> "${GITHUB_PATH}" # Generate PyTorch version to use echo "PYTORCH_VERSION=$(python3 .github/scripts/generate_pytorch_version.py --no-build-suffix)" >> "${GITHUB_ENV}" + - name: Setup test specific variables + if: ${{ startsWith(github.event.ref, 'refs/tags/v') }} + run: | + if [[ ${{ github.event.ref }} =~ ^refs/tags/v[0-9]+\.[0-9]+\.[0-9]+-rc[0-9]+$ ]]; then + { + echo "DOCKER_IMAGE=pytorch-test"; + echo "INSTALL_CHANNEL=pytorch-test"; + echo "TRITON_VERSION=$(cut -f 1 .ci/docker/triton_version.txt)"; + } >> "${GITHUB_ENV}" + fi - name: Setup nightly specific variables + if: ${{ github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/ciflow/nightly/') }} run: | @@ -115,17 +130,27 @@ jobs: if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' }} run: | PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime" + CUDA_SUFFIX="-cu${CUDA_VERSION}" + if [[ ${CUDA_VERSION_SHORT} == "cpu" ]]; then + PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-runtime" + CUDA_SUFFIX="" + fi PYTORCH_NIGHTLY_COMMIT=$(docker run ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \ python -c 'import torch; print(torch.version.git_version[:7],end="")') docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \ - ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}" - docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}" + ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" + + docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" + + # Please note, here we need to pin a specific version of CUDA for the latest label + if [[ ${CUDA_VERSION_SHORT} == "12.1" ]]; then + docker tag
ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" \ + ghcr.io/pytorch/pytorch-nightly:latest + docker push ghcr.io/pytorch/pytorch-nightly:latest + fi - docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}" \ - ghcr.io/pytorch/pytorch-nightly:latest - docker push ghcr.io/pytorch/pytorch-nightly:latest - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main if: always() diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 50e5eb0eef115..79a73abda9f76 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-aarch64-binary-manywheel + on: push: # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build @@ -30,7 +31,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-aarch64-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -53,7 +54,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_8-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cpu-aarch64-test: # Testing @@ -78,6 +79,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cpu-aarch64-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_8-cpu-aarch64-test with: PYTORCH_ROOT: /pytorch @@ -92,8 +96,6 @@ jobs: build_name: manywheel-py3_8-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -115,7 +117,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_9-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cpu-aarch64-test: # Testing @@ -140,6 +142,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cpu-aarch64-upload: # Uploading if: ${{ 
github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_9-cpu-aarch64-test with: PYTORCH_ROOT: /pytorch @@ -154,8 +159,6 @@ jobs: build_name: manywheel-py3_9-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -177,7 +180,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cpu-aarch64-test: # Testing @@ -202,6 +205,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cpu-aarch64-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_10-cpu-aarch64-test with: PYTORCH_ROOT: /pytorch @@ -216,8 +222,6 @@ jobs: build_name: manywheel-py3_10-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ 
secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -239,7 +243,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cpu-aarch64-test: # Testing @@ -264,6 +268,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cpu-aarch64-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_11-cpu-aarch64-test with: PYTORCH_ROOT: /pytorch @@ -278,8 +285,6 @@ jobs: build_name: manywheel-py3_11-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -301,7 +306,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cpu-aarch64-test: # Testing @@ -326,6 +331,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cpu-aarch64-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_12-cpu-aarch64-test with: PYTORCH_ROOT: /pytorch @@ -340,8 +348,6 @@ jobs: build_name: manywheel-py3_12-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-conda-nightly.yml b/.github/workflows/generated-linux-binary-conda-nightly.yml index 8b5b68c9a1866..50a6d986255f7 100644 --- a/.github/workflows/generated-linux-binary-conda-nightly.yml +++ b/.github/workflows/generated-linux-binary-conda-nightly.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-conda + on: push: # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build @@ -30,7 +31,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: 
/pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-conda-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -74,6 +75,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cpu-test with: PYTORCH_ROOT: /pytorch @@ -88,8 +92,6 @@ jobs: build_name: conda-py3_8-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -135,6 +137,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_8-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -150,8 +155,6 @@ jobs: build_name: conda-py3_8-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -197,6 +200,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_8-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -212,8 +218,69 @@ jobs: build_name: conda-py3_8-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + conda-py3_8-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.8" + runs_on: linux.24xlarge + build_name: conda-py3_8-cuda12_4 + build_environment: linux-binary-conda + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_8-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + 
GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.8" + build_name: conda-py3_8-cuda12_4 + build_environment: linux-binary-conda + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_8-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_8-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.8" + build_name: conda-py3_8-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -256,6 +323,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cpu-test with: PYTORCH_ROOT: /pytorch @@ -270,8 +340,6 @@ jobs: build_name: conda-py3_9-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -317,6 +385,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_9-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -332,8 +403,6 @@ jobs: build_name: conda-py3_9-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -379,6 +448,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_9-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -394,8 +466,69 @@ jobs: build_name: conda-py3_9-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + conda-py3_9-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually 
want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.9" + runs_on: linux.24xlarge + build_name: conda-py3_9-cuda12_4 + build_environment: linux-binary-conda + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_9-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.9" + build_name: conda-py3_9-cuda12_4 + build_environment: linux-binary-conda + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_9-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_9-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.9" + build_name: conda-py3_9-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -438,6 +571,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_10-cpu-test with: PYTORCH_ROOT: /pytorch @@ -452,8 +588,6 @@ jobs: build_name: conda-py3_10-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -499,6 +633,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_10-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_10-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -514,8 +651,6 @@ jobs: build_name: conda-py3_10-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -561,6 +696,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_10-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_10-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ 
-576,8 +714,69 @@ jobs: build_name: conda-py3_10-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + conda-py3_10-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.10" + runs_on: linux.24xlarge + build_name: conda-py3_10-cuda12_4 + build_environment: linux-binary-conda + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_10-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.10" + build_name: conda-py3_10-cuda12_4 + build_environment: linux-binary-conda + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_10-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_10-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.10" + build_name: conda-py3_10-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -620,6 +819,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_11-cpu-test with: PYTORCH_ROOT: /pytorch @@ -634,8 +836,6 @@ jobs: build_name: conda-py3_11-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -681,6 +881,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_11-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_11-cuda11_8-test with: 
PYTORCH_ROOT: /pytorch @@ -696,8 +899,6 @@ jobs: build_name: conda-py3_11-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -743,6 +944,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_11-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_11-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -758,8 +962,69 @@ jobs: build_name: conda-py3_11-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + conda-py3_11-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.11" + runs_on: linux.24xlarge + build_name: conda-py3_11-cuda12_4 + build_environment: linux-binary-conda + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_11-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_11-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.11" + build_name: conda-py3_11-cuda12_4 + build_environment: linux-binary-conda + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_11-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_11-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.11" + build_name: conda-py3_11-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -802,6 +1067,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: 
conda-py3_12-cpu-test with: PYTORCH_ROOT: /pytorch @@ -816,8 +1084,6 @@ jobs: build_name: conda-py3_12-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -863,6 +1129,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_12-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_12-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -878,8 +1147,6 @@ jobs: build_name: conda-py3_12-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -925,6 +1192,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_12-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_12-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -940,8 +1210,69 @@ jobs: build_name: conda-py3_12-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + conda-py3_12-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.12" + runs_on: linux.24xlarge + build_name: conda-py3_12-cuda12_4 + build_environment: linux-binary-conda + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_12-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_12-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.12" + build_name: conda-py3_12-cuda12_4 + build_environment: linux-binary-conda + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_12-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_12-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: 
/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.12" + build_name: conda-py3_12-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-main.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-main.yml index 5b2869c793502..5577a5e7d9c3a 100644 --- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-main.yml +++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-main.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-libtorch-cxx11-abi + on: push: branches: @@ -25,7 +26,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml index 4e887565d569f..d400e82249867 100644 --- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-libtorch-cxx11-abi + on: push: # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build @@ -30,7 +31,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -76,6 +77,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cpu-shared-with-deps-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -91,8 +95,6 @@ jobs: build_name: libtorch-cpu-shared-with-deps-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -139,6 +141,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda11_8-shared-with-deps-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda11_8-shared-with-deps-cxx11-abi-test with: 
PYTORCH_ROOT: /pytorch @@ -155,8 +160,6 @@ jobs: build_name: libtorch-cuda11_8-shared-with-deps-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -203,6 +206,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_1-shared-with-deps-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda12_1-shared-with-deps-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -219,13 +225,76 @@ jobs: build_name: libtorch-cuda12_1-shared-with-deps-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - libtorch-rocm5_6-shared-with-deps-cxx11-abi-build: + libtorch-cuda12_4-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.4-main + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + build_name: libtorch-cuda12_4-shared-with-deps-cxx11-abi + build_environment: linux-binary-libtorch-cxx11-abi + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_4-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda12_4-shared-with-deps-cxx11-abi-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.4-main + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + build_name: libtorch-cuda12_4-shared-with-deps-cxx11-abi + build_environment: linux-binary-libtorch-cxx11-abi + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_4-shared-with-deps-cxx11-abi-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_4-shared-with-deps-cxx11-abi-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.4-main + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + build_name: 
libtorch-cuda12_4-shared-with-deps-cxx11-abi + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-rocm6_0-shared-with-deps-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -234,19 +303,19 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.0-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-rocm5_6-shared-with-deps-cxx11-abi + build_name: libtorch-rocm6_0-shared-with-deps-cxx11-abi build_environment: linux-binary-libtorch-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-rocm5_6-shared-with-deps-cxx11-abi-test: # Testing + libtorch-rocm6_0-shared-with-deps-cxx11-abi-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_6-shared-with-deps-cxx11-abi-build + needs: libtorch-rocm6_0-shared-with-deps-cxx11-abi-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -255,11 +324,11 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.0-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi steps: @@ -268,7 +337,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: libtorch-rocm5_6-shared-with-deps-cxx11-abi + name: libtorch-rocm6_0-shared-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -301,36 +370,37 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/libtorch-cxx11-builder:rocm5.6-main + docker-image: pytorch/libtorch-cxx11-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - libtorch-rocm5_6-shared-with-deps-cxx11-abi-upload: # Uploading + libtorch-rocm6_0-shared-with-deps-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_6-shared-with-deps-cxx11-abi-test + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_0-shared-with-deps-cxx11-abi-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.0-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-rocm5_6-shared-with-deps-cxx11-abi + build_name: 
libtorch-rocm6_0-shared-with-deps-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - libtorch-rocm5_7-shared-with-deps-cxx11-abi-build: + libtorch-rocm6_1-shared-with-deps-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -339,19 +409,19 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.1-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-rocm5_7-shared-with-deps-cxx11-abi + build_name: libtorch-rocm6_1-shared-with-deps-cxx11-abi build_environment: linux-binary-libtorch-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-rocm5_7-shared-with-deps-cxx11-abi-test: # Testing + libtorch-rocm6_1-shared-with-deps-cxx11-abi-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_7-shared-with-deps-cxx11-abi-build + needs: libtorch-rocm6_1-shared-with-deps-cxx11-abi-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -360,11 +430,11 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.1-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi steps: @@ -373,7 +443,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: libtorch-rocm5_7-shared-with-deps-cxx11-abi + name: libtorch-rocm6_1-shared-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -406,31 +476,32 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/libtorch-cxx11-builder:rocm5.7-main + docker-image: pytorch/libtorch-cxx11-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - libtorch-rocm5_7-shared-with-deps-cxx11-abi-upload: # Uploading + libtorch-rocm6_1-shared-with-deps-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_7-shared-with-deps-cxx11-abi-test + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_1-shared-with-deps-cxx11-abi-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.7-main + 
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.1-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-rocm5_7-shared-with-deps-cxx11-abi + build_name: libtorch-rocm6_1-shared-with-deps-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-main.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-main.yml index 2fec2021b636c..0158860d6f942 100644 --- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-main.yml +++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-main.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-libtorch-pre-cxx11 + on: push: branches: @@ -25,7 +26,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml index e93aa4177b530..3205c3c78dad4 100644 --- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-libtorch-pre-cxx11 + on: push: # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build @@ -30,7 +31,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -76,6 +77,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cpu-shared-with-deps-pre-cxx11-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cpu-shared-with-deps-pre-cxx11-test with: PYTORCH_ROOT: /pytorch @@ -91,8 +95,6 @@ jobs: build_name: libtorch-cpu-shared-with-deps-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -139,6 +141,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda11_8-shared-with-deps-pre-cxx11-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: 
+ id-token: write + contents: read needs: libtorch-cuda11_8-shared-with-deps-pre-cxx11-test with: PYTORCH_ROOT: /pytorch @@ -155,8 +160,6 @@ jobs: build_name: libtorch-cuda11_8-shared-with-deps-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -203,6 +206,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_1-shared-with-deps-pre-cxx11-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda12_1-shared-with-deps-pre-cxx11-test with: PYTORCH_ROOT: /pytorch @@ -219,13 +225,76 @@ jobs: build_name: libtorch-cuda12_1-shared-with-deps-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - libtorch-rocm5_6-shared-with-deps-pre-cxx11-build: + libtorch-cuda12_4-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + build_name: libtorch-cuda12_4-shared-with-deps-pre-cxx11 + build_environment: linux-binary-libtorch-pre-cxx11 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_4-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda12_4-shared-with-deps-pre-cxx11-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + build_name: libtorch-cuda12_4-shared-with-deps-pre-cxx11 + build_environment: linux-binary-libtorch-pre-cxx11 + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_4-shared-with-deps-pre-cxx11-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_4-shared-with-deps-pre-cxx11-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + 
LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + build_name: libtorch-cuda12_4-shared-with-deps-pre-cxx11 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-rocm6_0-shared-with-deps-pre-cxx11-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -234,19 +303,19 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-rocm5_6-shared-with-deps-pre-cxx11 + build_name: libtorch-rocm6_0-shared-with-deps-pre-cxx11 build_environment: linux-binary-libtorch-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-rocm5_6-shared-with-deps-pre-cxx11-test: # Testing + libtorch-rocm6_0-shared-with-deps-pre-cxx11-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_6-shared-with-deps-pre-cxx11-build + needs: libtorch-rocm6_0-shared-with-deps-pre-cxx11-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -255,11 +324,11 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 steps: @@ -268,7 +337,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: libtorch-rocm5_6-shared-with-deps-pre-cxx11 + name: libtorch-rocm6_0-shared-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -301,36 +370,37 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.6-main + docker-image: pytorch/manylinux-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - libtorch-rocm5_6-shared-with-deps-pre-cxx11-upload: # Uploading + libtorch-rocm6_0-shared-with-deps-pre-cxx11-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_6-shared-with-deps-pre-cxx11-test + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_0-shared-with-deps-pre-cxx11-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-rocm5_6-shared-with-deps-pre-cxx11 + 
build_name: libtorch-rocm6_0-shared-with-deps-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - libtorch-rocm5_7-shared-with-deps-pre-cxx11-build: + libtorch-rocm6_1-shared-with-deps-pre-cxx11-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -339,19 +409,19 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-rocm5_7-shared-with-deps-pre-cxx11 + build_name: libtorch-rocm6_1-shared-with-deps-pre-cxx11 build_environment: linux-binary-libtorch-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-rocm5_7-shared-with-deps-pre-cxx11-test: # Testing + libtorch-rocm6_1-shared-with-deps-pre-cxx11-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_7-shared-with-deps-pre-cxx11-build + needs: libtorch-rocm6_1-shared-with-deps-pre-cxx11-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -360,11 +430,11 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 steps: @@ -373,7 +443,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: libtorch-rocm5_7-shared-with-deps-pre-cxx11 + name: libtorch-rocm6_1-shared-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -406,31 +476,32 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.7-main + docker-image: pytorch/manylinux-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - libtorch-rocm5_7-shared-with-deps-pre-cxx11-upload: # Uploading + libtorch-rocm6_1-shared-with-deps-pre-cxx11-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_7-shared-with-deps-pre-cxx11-test + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_1-shared-with-deps-pre-cxx11-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: 
pytorch/manylinux-builder:rocm6.1-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-rocm5_7-shared-with-deps-pre-cxx11 + build_name: libtorch-rocm6_1-shared-with-deps-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml index e10d4ef7f725f..4764ede6bcb2c 100644 --- a/.github/workflows/generated-linux-binary-manywheel-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-main.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-manywheel + on: push: branches: @@ -25,7 +26,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -47,7 +48,7 @@ jobs: DESIRED_PYTHON: "3.8" build_name: manywheel-py3_8-cuda11_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cuda11_8-test: # Testing @@ -87,7 +88,7 @@ jobs: DESIRED_PYTHON: "3.8" build_name: manywheel-py3_8-cuda12_1 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cuda12_1-test: # Testing diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 783d54b8157c8..8ad43b4c36607 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-manywheel + on: push: # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build @@ -30,7 +31,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} 
cancel-in-progress: true @@ -74,6 +75,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_8-cpu-test with: PYTORCH_ROOT: /pytorch @@ -88,8 +92,6 @@ jobs: build_name: manywheel-py3_8-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -134,6 +136,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cpu-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_8-cpu-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -149,8 +154,6 @@ jobs: build_name: manywheel-py3_8-cpu-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -171,7 +174,7 @@ jobs: DESIRED_PYTHON: "3.8" build_name: manywheel-py3_8-cuda11_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system 
== 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cuda11_8-test: # Testing @@ -196,6 +199,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_8-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -211,8 +217,6 @@ jobs: build_name: manywheel-py3_8-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -233,7 +237,7 @@ jobs: DESIRED_PYTHON: "3.8" build_name: manywheel-py3_8-cuda12_1 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cuda12_1-test: # Testing @@ -258,6 +262,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cuda12_1-upload: # Uploading if: ${{ 
github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_8-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -273,13 +280,11 @@ jobs: build_name: manywheel-py3_8-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_8-rocm5_6-build: + manywheel-py3_8-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -288,18 +293,81 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.8" + build_name: manywheel-py3_8-cuda12_4 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_8-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.8" + build_name: manywheel-py3_8-cuda12_4 + build_environment: linux-binary-manywheel + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_8-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_8-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + 
GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.8" + build_name: manywheel-py3_8-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_8-rocm6_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.8" - build_name: manywheel-py3_8-rocm5_6 + build_name: manywheel-py3_8-rocm6_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_8-rocm5_6-test: # Testing + manywheel-py3_8-rocm6_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm5_6-build + needs: manywheel-py3_8-rocm6_0-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -308,11 +376,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.8" steps: - name: Setup ROCm @@ -320,7 +388,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_8-rocm5_6 + name: manywheel-py3_8-rocm6_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -353,35 +421,36 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.6-main + docker-image: pytorch/manylinux-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_8-rocm5_6-upload: # Uploading + manywheel-py3_8-rocm6_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm5_6-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_8-rocm6_0-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.8" - build_name: manywheel-py3_8-rocm5_6 + build_name: manywheel-py3_8-rocm6_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ 
secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_8-rocm5_7-build: + manywheel-py3_8-rocm6_1-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -390,18 +459,18 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.8" - build_name: manywheel-py3_8-rocm5_7 + build_name: manywheel-py3_8-rocm6_1 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_8-rocm5_7-test: # Testing + manywheel-py3_8-rocm6_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm5_7-build + needs: manywheel-py3_8-rocm6_1-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -410,11 +479,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.8" steps: - name: Setup ROCm @@ -422,7 +491,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_8-rocm5_7 + name: manywheel-py3_8-rocm6_1 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -455,30 +524,31 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.7-main + docker-image: pytorch/manylinux-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_8-rocm5_7-upload: # Uploading + manywheel-py3_8-rocm6_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm5_7-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_8-rocm6_1-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.8" - build_name: manywheel-py3_8-rocm5_7 + build_name: manywheel-py3_8-rocm6_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -521,6 +591,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + 
contents: read needs: manywheel-py3_9-cpu-test with: PYTORCH_ROOT: /pytorch @@ -535,8 +608,6 @@ jobs: build_name: manywheel-py3_9-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -581,6 +652,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cpu-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_9-cpu-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -596,8 +670,6 @@ jobs: build_name: manywheel-py3_9-cpu-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -618,7 +690,7 @@ jobs: DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda11_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 
'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda11_8-test: # Testing @@ -643,6 +715,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_9-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -658,8 +733,6 @@ jobs: build_name: manywheel-py3_9-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -680,7 +753,7 @@ jobs: DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_1 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_1-test: # Testing @@ -705,6 +778,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_9-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -720,13 +796,74 @@ jobs: build_name: manywheel-py3_9-cuda12_1 secrets: 
github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_9-rocm5_6-build: + manywheel-py3_9-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_4 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_4 + build_environment: linux-binary-manywheel + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ 
secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-rocm6_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -735,18 +872,18 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-rocm5_6 + build_name: manywheel-py3_9-rocm6_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm5_6-test: # Testing + manywheel-py3_9-rocm6_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm5_6-build + needs: manywheel-py3_9-rocm6_0-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -755,11 +892,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.9" steps: - name: Setup ROCm @@ -767,7 +904,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_9-rocm5_6 + name: manywheel-py3_9-rocm6_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -800,35 +937,36 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.6-main + docker-image: pytorch/manylinux-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_9-rocm5_6-upload: # Uploading + manywheel-py3_9-rocm6_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm5_6-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-rocm6_0-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-rocm5_6 + build_name: manywheel-py3_9-rocm6_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_9-rocm5_7-build: + manywheel-py3_9-rocm6_1-build: if: ${{ github.repository_owner == 'pytorch' }} uses: 
./.github/workflows/_binary-build-linux.yml with: @@ -837,18 +975,18 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-rocm5_7 + build_name: manywheel-py3_9-rocm6_1 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm5_7-test: # Testing + manywheel-py3_9-rocm6_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm5_7-build + needs: manywheel-py3_9-rocm6_1-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -857,11 +995,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.9" steps: - name: Setup ROCm @@ -869,7 +1007,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_9-rocm5_7 + name: manywheel-py3_9-rocm6_1 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -902,30 +1040,31 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.7-main + docker-image: pytorch/manylinux-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_9-rocm5_7-upload: # Uploading + manywheel-py3_9-rocm6_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm5_7-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-rocm6_1-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-rocm5_7 + build_name: manywheel-py3_9-rocm6_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -968,6 +1107,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_10-cpu-test with: PYTORCH_ROOT: /pytorch @@ -982,8 +1124,6 @@ jobs: build_name: manywheel-py3_10-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - 
aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1028,6 +1168,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cpu-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_10-cpu-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -1043,8 +1186,6 @@ jobs: build_name: manywheel-py3_10-cpu-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1065,7 +1206,7 @@ jobs: DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda11_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda11_8-test: # Testing @@ -1090,6 +1231,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} 
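The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values in these jobs pack several PEP 508 requirement strings into one setting, joined with " | "; the environment marker on each entry (platform_system == 'Linux' and platform_machine == 'x86_64') restricts the pinned NVIDIA wheels to Linux x86_64 installs. As a rough sketch of how such markers behave, assuming only the standard packaging library (this is not the builder's actual parsing code, and the variable names below are illustrative; the two pins are copied from the cu12.4 lists in this workflow):

    from packaging.requirements import Requirement

    # Two entries in the same " | "-joined format used by PYTORCH_EXTRA_INSTALL_REQUIREMENTS.
    extra = (
        "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'"
    )

    for spec in extra.split(" | "):
        req = Requirement(spec)  # parses "name==version; marker"
        # Marker.evaluate() checks the marker against the running interpreter/platform.
        applies = req.marker is None or req.marker.evaluate()
        print(req.name, req.specifier, "install" if applies else "skipped on this platform")

pip applies the same marker evaluation at install time, so these extra pins are simply skipped on platforms other than Linux x86_64.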
manywheel-py3_10-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_10-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -1105,8 +1249,6 @@ jobs: build_name: manywheel-py3_10-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1127,7 +1269,7 @@ jobs: DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_1 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_1-test: # Testing @@ -1152,6 +1294,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_10-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -1167,13 +1312,11 @@ jobs: build_name: manywheel-py3_10-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-rocm5_6-build: + manywheel-py3_10-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -1182,18 +1325,81 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_4 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_4 + build_environment: linux-binary-manywheel + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ 
secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_10-rocm6_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-rocm5_6 + build_name: manywheel-py3_10-rocm6_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-rocm5_6-test: # Testing + manywheel-py3_10-rocm6_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm5_6-build + needs: manywheel-py3_10-rocm6_0-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -1202,11 +1408,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.10" steps: - name: Setup ROCm @@ -1214,7 +1420,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_10-rocm5_6 + name: manywheel-py3_10-rocm6_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1247,35 +1453,36 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.6-main + docker-image: pytorch/manylinux-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_10-rocm5_6-upload: # Uploading + manywheel-py3_10-rocm6_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm5_6-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-rocm6_0-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-rocm5_6 + build_name: manywheel-py3_10-rocm6_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-rocm5_7-build: + manywheel-py3_10-rocm6_1-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -1284,18 +1491,18 @@ jobs: PACKAGE_TYPE: 
manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-rocm5_7 + build_name: manywheel-py3_10-rocm6_1 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-rocm5_7-test: # Testing + manywheel-py3_10-rocm6_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm5_7-build + needs: manywheel-py3_10-rocm6_1-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -1304,11 +1511,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.10" steps: - name: Setup ROCm @@ -1316,7 +1523,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_10-rocm5_7 + name: manywheel-py3_10-rocm6_1 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1349,30 +1556,31 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.7-main + docker-image: pytorch/manylinux-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_10-rocm5_7-upload: # Uploading + manywheel-py3_10-rocm6_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm5_7-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-rocm6_1-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-rocm5_7 + build_name: manywheel-py3_10-rocm6_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1415,6 +1623,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_11-cpu-test with: PYTORCH_ROOT: /pytorch @@ -1429,8 +1640,6 @@ jobs: build_name: manywheel-py3_11-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1475,6 +1684,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cpu-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_11-cpu-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -1490,8 +1702,6 @@ jobs: build_name: manywheel-py3_11-cpu-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1512,7 +1722,7 @@ jobs: DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda11_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda11_8-test: # Testing @@ -1537,6 +1747,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} 
+ permissions: + id-token: write + contents: read needs: manywheel-py3_11-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -1552,8 +1765,6 @@ jobs: build_name: manywheel-py3_11-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1574,7 +1785,7 @@ jobs: DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_1 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_1-test: # Testing @@ -1599,6 +1810,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_11-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -1614,13 +1828,11 @@ jobs: build_name: manywheel-py3_11-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} 
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-rocm5_6-build: + manywheel-py3_11-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -1629,18 +1841,81 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_4 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_11-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_4 + build_environment: linux-binary-manywheel + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-rocm6_0-build: + if: 
${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-rocm5_6 + build_name: manywheel-py3_11-rocm6_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-rocm5_6-test: # Testing + manywheel-py3_11-rocm6_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_11-rocm5_6-build + needs: manywheel-py3_11-rocm6_0-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -1649,11 +1924,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.11" steps: - name: Setup ROCm @@ -1661,7 +1936,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_11-rocm5_6 + name: manywheel-py3_11-rocm6_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1694,35 +1969,36 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.6-main + docker-image: pytorch/manylinux-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_11-rocm5_6-upload: # Uploading + manywheel-py3_11-rocm6_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_11-rocm5_6-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-rocm6_0-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-rocm5_6 + build_name: manywheel-py3_11-rocm6_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-rocm5_7-build: + manywheel-py3_11-rocm6_1-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -1731,18 +2007,18 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: 
rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-rocm5_7 + build_name: manywheel-py3_11-rocm6_1 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-rocm5_7-test: # Testing + manywheel-py3_11-rocm6_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_11-rocm5_7-build + needs: manywheel-py3_11-rocm6_1-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -1751,11 +2027,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.11" steps: - name: Setup ROCm @@ -1763,7 +2039,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_11-rocm5_7 + name: manywheel-py3_11-rocm6_1 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1796,30 +2072,31 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.7-main + docker-image: pytorch/manylinux-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_11-rocm5_7-upload: # Uploading + manywheel-py3_11-rocm6_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_11-rocm5_7-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-rocm6_1-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-rocm5_7 + build_name: manywheel-py3_11-rocm6_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1862,6 +2139,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_12-cpu-test with: PYTORCH_ROOT: /pytorch @@ -1876,8 +2156,6 @@ jobs: build_name: manywheel-py3_12-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ 
secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1922,6 +2200,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cpu-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_12-cpu-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -1937,8 +2218,6 @@ jobs: build_name: manywheel-py3_12-cpu-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1959,7 +2238,7 @@ jobs: DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda11_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda11_8-test: # Testing @@ -1984,6 +2263,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_12-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ 
-1999,8 +2281,6 @@ jobs: build_name: manywheel-py3_12-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -2021,7 +2301,7 @@ jobs: DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_1 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_1-test: # Testing @@ -2046,6 +2326,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_12-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -2061,13 +2344,74 @@ jobs: build_name: manywheel-py3_12-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: 
./.github/workflows/_binary-upload.yml - manywheel-py3_12-rocm5_6-build: + manywheel-py3_12-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_4 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_12-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_4 + build_environment: linux-binary-manywheel + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_12-rocm6_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -2076,18 +2420,18 @@ jobs: PACKAGE_TYPE: manywheel # 
TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm5_6 + build_name: manywheel-py3_12-rocm6_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-rocm5_6-test: # Testing + manywheel-py3_12-rocm6_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_12-rocm5_6-build + needs: manywheel-py3_12-rocm6_0-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -2096,11 +2440,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.12" steps: - name: Setup ROCm @@ -2108,7 +2452,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_12-rocm5_6 + name: manywheel-py3_12-rocm6_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2141,35 +2485,36 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.6-main + docker-image: pytorch/manylinux-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_12-rocm5_6-upload: # Uploading + manywheel-py3_12-rocm6_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_12-rocm5_6-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-rocm6_0-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm5_6 + build_name: manywheel-py3_12-rocm6_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-rocm5_7-build: + manywheel-py3_12-rocm6_1-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -2178,18 +2523,18 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: 
pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm5_7 + build_name: manywheel-py3_12-rocm6_1 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-rocm5_7-test: # Testing + manywheel-py3_12-rocm6_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_12-rocm5_7-build + needs: manywheel-py3_12-rocm6_1-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -2198,11 +2543,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.12" steps: - name: Setup ROCm @@ -2210,7 +2555,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_12-rocm5_7 + name: manywheel-py3_12-rocm6_1 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2243,30 +2588,31 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.7-main + docker-image: pytorch/manylinux-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_12-rocm5_7-upload: # Uploading + manywheel-py3_12-rocm6_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_12-rocm5_7-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-rocm6_1-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm5_7 + build_name: manywheel-py3_12-rocm6_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml index 40f0ece4ff4a2..a8cbdb7cd6feb 100644 --- a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml @@ -26,9 +26,7 @@ env: BUILD_ENVIRONMENT: macos-arm64-binary-conda GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 - CROSS_COMPILE_ARM64: 1 - + SKIP_ALL_TESTS: 0 concurrency: group: macos-arm64-binary-conda-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ 
github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -36,7 +34,7 @@ concurrency: jobs: conda-py3_8-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl + runs-on: macos-13-xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -131,6 +129,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" conda-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cpu-build with: PYTORCH_ROOT: /pytorch @@ -146,14 +147,12 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml conda-py3_9-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl + runs-on: macos-13-xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -248,6 +247,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" conda-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cpu-build with: PYTORCH_ROOT: /pytorch @@ -263,14 +265,12 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml conda-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl + runs-on: macos-13-xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -365,6 +365,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" conda-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_10-cpu-build with: PYTORCH_ROOT: /pytorch @@ -380,14 +383,12 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml conda-py3_11-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl + runs-on: macos-13-xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -482,6 +483,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" conda-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_11-cpu-build with: PYTORCH_ROOT: /pytorch @@ -497,14 +501,12 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ 
secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml conda-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl + runs-on: macos-13-xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -599,6 +601,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" conda-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_12-cpu-build with: PYTORCH_ROOT: /pytorch @@ -614,8 +619,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-cxx11-abi-nightly.yml index 7a7e7d563ae80..0ed7ba10a07d5 100644 --- a/.github/workflows/generated-macos-arm64-binary-libtorch-cxx11-abi-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-cxx11-abi-nightly.yml @@ -26,7 +26,7 @@ env: BUILD_ENVIRONMENT: macos-arm64-binary-libtorch-cxx11-abi GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: macos-arm64-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -133,6 +133,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cpu-shared-with-deps-cxx11-abi-build with: PYTORCH_ROOT: /pytorch @@ -149,8 +152,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index a0114ebe2f75b..167161de3645c 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -26,7 +26,7 @@ env: BUILD_ENVIRONMENT: macos-arm64-binary-wheel GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: macos-arm64-binary-wheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -46,7 +46,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 
1 DESIRED_PYTHON: "3.8" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -130,6 +130,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" wheel-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_8-cpu-build with: PYTORCH_ROOT: /pytorch @@ -145,8 +148,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -164,7 +165,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -248,6 +249,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" wheel-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_9-cpu-build with: PYTORCH_ROOT: /pytorch @@ -263,8 +267,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -282,7 +284,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -366,6 +368,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" wheel-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_10-cpu-build with: PYTORCH_ROOT: /pytorch @@ -381,8 +386,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -400,7 +403,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -484,6 +487,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" wheel-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_11-cpu-build with: PYTORCH_ROOT: /pytorch @@ -499,8 +505,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -518,7 +522,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -602,6 +606,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" wheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_12-cpu-build with: PYTORCH_ROOT: /pytorch @@ -617,8 +624,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-binary-conda-nightly.yml b/.github/workflows/generated-macos-binary-conda-nightly.yml deleted file mode 100644 index 984b54cd19a43..0000000000000 --- a/.github/workflows/generated-macos-binary-conda-nightly.yml +++ /dev/null @@ -1,619 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-conda - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_conda/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-conda - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-conda-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - conda-py3_8-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are 
put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_8-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu-main - DESIRED_PYTHON: "3.8" - build_name: conda-py3_8-cpu - use_s3: 
False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - conda-py3_9-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output 
/usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_9-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu-main - DESIRED_PYTHON: "3.9" - build_name: conda-py3_9-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - conda-py3_10-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_10-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu-main - DESIRED_PYTHON: "3.10" - build_name: conda-py3_10-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - conda-py3_11-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - 
run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_11-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_11-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_11-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu-main - DESIRED_PYTHON: "3.11" - build_name: conda-py3_11-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - conda-py3_12-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_12-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_12-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_12-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu-main - DESIRED_PYTHON: "3.12" - build_name: conda-py3_12-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml deleted file mode 100644 index 1e0a7bfbe84dd..0000000000000 --- a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml +++ /dev/null @@ -1,156 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-libtorch-cxx11-abi - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_libtorch/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-libtorch-cxx11-abi - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - libtorch-cpu-shared-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.8" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-cxx11-abi-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-main - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cpu-shared-with-deps-cxx11-abi - use_s3: False - secrets: - github-token: ${{ 
secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-binary-wheel-nightly.yml b/.github/workflows/generated-macos-binary-wheel-nightly.yml deleted file mode 100644 index fc5bc266e2c51..0000000000000 --- a/.github/workflows/generated-macos-binary-wheel-nightly.yml +++ /dev/null @@ -1,624 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-wheel - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_wheel/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-wheel - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-wheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - wheel-py3_8-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} 
- AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_8-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - 
GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - DESIRED_PYTHON: "3.8" - build_name: wheel-py3_8-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - wheel-py3_9-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
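The note above is the reason each job writes its own variables through GITHUB_ENV instead of a workflow-level env block: runner.temp only resolves at job run time. As an illustration only (not part of the generated workflow), a minimal bash sketch of that mechanism, using a hypothetical MY_OUTPUT_DIR variable:

# Illustration only: values appended to the file named by GITHUB_ENV become
# environment variables in *subsequent* steps of the same job.
# Step 1: write the variable (RUNNER_TEMP is only known once the job runs).
echo "MY_OUTPUT_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# Step 2, in a later step of the same job: the variable is now in the environment.
echo "artifacts will be written to ${MY_OUTPUT_DIR}"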
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_9-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
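The PYTORCH_EXTRA_INSTALL_REQUIREMENTS value above is a pipe-separated list of PEP 508 requirement strings; the platform_system/platform_machine markers make pip skip the CUDA wheels everywhere except Linux x86_64. A single-entry illustration (quoting keeps the marker intact when passed through the shell):

# Illustration only: one entry from the list above installed directly with pip.
# On macOS, or on non-x86_64 Linux, the marker evaluates to false and pip skips it.
pip install "nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64'"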
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_10-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
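The sccache step further down installs the binary into /usr/local/bin and points SCCACHE_BUCKET at the shared compiler cache. A quick sanity check of that setup (illustration only, not a step in the workflow):

# Illustration only: confirm sccache is on PATH and is actually serving requests.
sccache --version       # the binary fetched by the install step below
sccache --show-stats    # hit/miss counters; requests should be non-zero after a build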
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_11-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_11-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
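The "Install conda and dependencies" step below exports DEVELOPER_DIR to pin the build to Xcode 14.3.1 when that app bundle exists, falling back to 13.3.1. A small check of which toolchain that selection actually resolves to (illustration only):

# Illustration only: verify the Xcode picked up via DEVELOPER_DIR.
echo "DEVELOPER_DIR=${DEVELOPER_DIR:-<unset>}"
xcode-select -p       # active developer directory
xcodebuild -version   # Xcode and build version the compilers will come from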
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_12-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_12-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-conda-nightly.yml b/.github/workflows/generated-windows-binary-conda-nightly.yml index d87d28c270733..c3e4a038896e7 100644 --- a/.github/workflows/generated-windows-binary-conda-nightly.yml +++ b/.github/workflows/generated-windows-binary-conda-nightly.yml @@ -255,6 +255,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 conda-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -268,8 +271,6 @@ jobs: build_name: conda-py3_8-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -498,6 +499,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 conda-py3_8-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -512,8 +516,6 @@ jobs: build_name: conda-py3_8-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -742,6 +744,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 conda-py3_8-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -756,8 +761,251 @@ jobs: build_name: conda-py3_8-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + conda-py3_8-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: conda-py3_8-cuda12_4 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} 
+ needs: conda-py3_8-cuda12_4-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
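The get_ec2_metadata helper above queries the instance metadata endpoint without a session token (IMDSv1). If the runner AMI were ever switched to require IMDSv2, which is an assumption and not something this workflow does, the token-based variant would look roughly like this:

# Assumption/illustration only: IMDSv2 form of the metadata lookup above.
function get_ec2_metadata_v2() {
  local category=$1
  local token
  token=$(curl -fsSL -X PUT "http://169.254.169.254/latest/api/token" \
    -H "X-aws-ec2-metadata-token-ttl-seconds: 60")
  curl -fsSL -H "X-aws-ec2-metadata-token: ${token}" \
    "http://169.254.169.254/latest/meta-data/${category}"
}
get_ec2_metadata_v2 instance-type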
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda12_4 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_8-cuda12_4-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.8" + build_name: conda-py3_8-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -984,6 +1232,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 conda-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -997,8 +1248,6 @@ jobs: build_name: conda-py3_9-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1227,6 +1476,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 conda-py3_9-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace 
}}/pytorch @@ -1241,8 +1493,6 @@ jobs: build_name: conda-py3_9-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1471,6 +1721,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 conda-py3_9-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1485,12 +1738,987 @@ jobs: build_name: conda-py3_9-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_10-cpu-build: + conda-py3_9-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: conda-py3_9-cuda12_4 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda12_4-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" 
+ echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda12_4 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_9-cuda12_4-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + 
GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.9" + build_name: conda-py3_9-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + conda-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
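The upload jobs in this file now carry permissions with id-token: write and no longer pass the aws-pytorch-uploader-* secrets, so the reusable _binary-upload.yml is presumably meant to obtain short-lived AWS credentials through GitHub's OIDC token rather than long-lived keys; the role configuration itself lives in that reusable workflow and is not shown here. A quick way to confirm such credentials from inside a job once a role has been assumed (illustration only):

# Illustration only: with OIDC-based credentials in place, the caller identity
# should be the CI upload role, not a static IAM user tied to stored keys.
aws sts get-caller-identity --output json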
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: conda-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-build + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_10-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.10" + build_name: conda-py3_10-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + conda-py3_10-cuda11_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually 
want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
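The "Clean PyTorch checkout" and "Clean pytorch/builder checkout" steps below remove every untracked and ignored file with git clean -fxd. When reproducing a build locally it is worth previewing that deletion first (illustration only):

# Illustration only: preview what the cleanup steps below would delete, then run it.
git clean -nxd    # dry run: list untracked and ignored files that would be removed
git clean -fxd    # actually remove them (the same flags the workflow uses)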
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: conda-py3_10-cuda11_8 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda11_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_8-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_8 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda11_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_10-cuda11_8-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: conda-py3_10-cuda11_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + conda-py3_10-cuda12_1-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: 
This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
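The job, artifact, and build names throughout this file follow a single convention: a DESIRED_CUDA value such as "cu121" pairs with GPU_ARCH_VERSION "12.1", and the build name has the shape "<package>-py<major>_<minor>-cuda<major>_<minor>" (or "-cpu" for CPU-only jobs). The bash sketch below only illustrates that mapping; the build_name helper is hypothetical, and the real names are baked into the generated YAML rather than computed at runtime.

    # Illustrative only: derive a build/artifact name from the fields the
    # env blocks above set (PACKAGE_TYPE, DESIRED_PYTHON, DESIRED_CUDA).
    build_name() {
      local package_type=$1 desired_python=$2 desired_cuda=$3
      local arch_tag
      if [[ "${desired_cuda}" == "cpu" ]]; then
        arch_tag="cpu"
      else
        local ver="${desired_cuda#cu}"          # e.g. "121"
        arch_tag="cuda${ver:0:2}_${ver:2}"      # e.g. "cuda12_1"
      fi
      echo "${package_type}-py${desired_python/./_}-${arch_tag}"
    }
    build_name conda 3.10 cu121   # -> conda-py3_10-cuda12_1
    build_name conda 3.12 cpu     # -> conda-py3_12-cpu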
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: conda-py3_10-cuda12_1 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda12_1-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda12_1-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda12_1 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda12_1-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_10-cuda12_1-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: conda-py3_10-cuda12_1 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + conda-py3_10-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -1500,8 +2728,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of 
GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: @@ -1584,7 +2813,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_10-cpu + name: conda-py3_10-cuda12_4 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1601,10 +2830,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cpu-test: # Testing + conda-py3_10-cuda12_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-build - runs-on: windows.4xlarge.nonephemeral + needs: conda-py3_10-cuda12_4-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1612,8 +2841,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: @@ -1663,7 +2893,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_10-cpu + name: conda-py3_10-cuda12_4 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1711,27 +2941,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cpu-upload: # Uploading + conda-py3_10-cuda12_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-test + permissions: + id-token: write + contents: read + needs: conda-py3_10-cuda12_4-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" - build_name: conda-py3_10-cpu + build_name: conda-py3_10-cuda12_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_10-cuda11_8-build: + conda-py3_11-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -1741,11 +2973,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -1826,7 +3057,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_10-cuda11_8 + name: conda-py3_11-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1843,10 +3074,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cuda11_8-test: # Testing + conda-py3_11-cpu-test: # Testing if: ${{ 
github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_8-build - runs-on: windows.8xlarge.nvidia.gpu + needs: conda-py3_11-cpu-build + runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1854,11 +3085,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -1906,7 +3136,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_10-cuda11_8 + name: conda-py3_11-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1954,28 +3184,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cuda11_8-upload: # Uploading + conda-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_8-test + permissions: + id-token: write + contents: read + needs: conda-py3_11-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: conda-py3_10-cuda11_8 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.11" + build_name: conda-py3_11-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_10-cuda12_1-build: + conda-py3_11-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -1985,11 +3215,11 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -2070,7 +3300,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_10-cuda12_1 + name: conda-py3_11-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2087,9 +3317,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cuda12_1-test: # Testing + conda-py3_11-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda12_1-build + needs: conda-py3_11-cuda11_8-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -2098,11 +3328,11 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - 
DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -2150,7 +3380,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_10-cuda12_1 + name: conda-py3_11-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2198,28 +3428,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cuda12_1-upload: # Uploading + conda-py3_11-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda12_1-test + permissions: + id-token: write + contents: read + needs: conda-py3_11-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: conda-py3_10-cuda12_1 + DESIRED_PYTHON: "3.11" + build_name: conda-py3_11-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_11-cpu-build: + conda-py3_11-cuda12_1-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2229,8 +3460,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" steps: @@ -2313,7 +3545,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_11-cpu + name: conda-py3_11-cuda12_1 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2330,10 +3562,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_11-cpu-test: # Testing + conda-py3_11-cuda12_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_11-cpu-build - runs-on: windows.4xlarge.nonephemeral + needs: conda-py3_11-cuda12_1-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -2341,8 +3573,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" steps: @@ -2392,7 +3625,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_11-cpu + name: conda-py3_11-cuda12_1 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2440,27 +3673,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_11-cpu-upload: # Uploading + conda-py3_11-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: 
conda-py3_11-cpu-test + permissions: + id-token: write + contents: read + needs: conda-py3_11-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" - build_name: conda-py3_11-cpu + build_name: conda-py3_11-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_11-cuda11_8-build: + conda-py3_11-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2470,8 +3705,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -2555,7 +3790,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_11-cuda11_8 + name: conda-py3_11-cuda12_4 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2572,9 +3807,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_11-cuda11_8-test: # Testing + conda-py3_11-cuda12_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_11-cuda11_8-build + needs: conda-py3_11-cuda12_4-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -2583,8 +3818,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -2635,7 +3870,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_11-cuda11_8 + name: conda-py3_11-cuda12_4 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2683,28 +3918,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_11-cuda11_8-upload: # Uploading + conda-py3_11-cuda12_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_11-cuda11_8-test + permissions: + id-token: write + contents: read + needs: conda-py3_11-cuda12_4-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" - build_name: conda-py3_11-cuda11_8 + build_name: conda-py3_11-cuda12_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ 
secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_11-cuda12_1-build: + conda-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2714,11 +3950,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -2799,7 +4034,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_11-cuda12_1 + name: conda-py3_12-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2816,10 +4051,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_11-cuda12_1-test: # Testing + conda-py3_12-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_11-cuda12_1-build - runs-on: windows.8xlarge.nvidia.gpu + needs: conda-py3_12-cpu-build + runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -2827,11 +4062,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -2879,7 +4113,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_11-cuda12_1 + name: conda-py3_12-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2927,28 +4161,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_11-cuda12_1-upload: # Uploading + conda-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_11-cuda12_1-test + permissions: + id-token: write + contents: read + needs: conda-py3_12-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: conda-py3_11-cuda12_1 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.12" + build_name: conda-py3_12-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_12-cpu-build: + conda-py3_12-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2958,8 +4192,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually 
want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: @@ -3042,7 +4277,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_12-cpu + name: conda-py3_12-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3059,10 +4294,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_12-cpu-test: # Testing + conda-py3_12-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_12-cpu-build - runs-on: windows.4xlarge.nonephemeral + needs: conda-py3_12-cuda11_8-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -3070,8 +4305,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: @@ -3121,7 +4357,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_12-cpu + name: conda-py3_12-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -3169,27 +4405,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_12-cpu-upload: # Uploading + conda-py3_12-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_12-cpu-test + permissions: + id-token: write + contents: read + needs: conda-py3_12-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: conda-py3_12-cpu + build_name: conda-py3_12-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_12-cuda11_8-build: + conda-py3_12-cuda12_1-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -3199,8 +4437,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3284,7 +4522,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_12-cuda11_8 + name: conda-py3_12-cuda12_1 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3301,9 +4539,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_12-cuda11_8-test: # Testing + conda-py3_12-cuda12_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - 
needs: conda-py3_12-cuda11_8-build + needs: conda-py3_12-cuda12_1-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -3312,8 +4550,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3364,7 +4602,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_12-cuda11_8 + name: conda-py3_12-cuda12_1 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -3412,28 +4650,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_12-cuda11_8-upload: # Uploading + conda-py3_12-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_12-cuda11_8-test + permissions: + id-token: write + contents: read + needs: conda-py3_12-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: conda-py3_12-cuda11_8 + build_name: conda-py3_12-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_12-cuda12_1-build: + conda-py3_12-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -3443,8 +4682,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3528,7 +4767,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_12-cuda12_1 + name: conda-py3_12-cuda12_4 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3545,9 +4784,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_12-cuda12_1-test: # Testing + conda-py3_12-cuda12_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_12-cuda12_1-build + needs: conda-py3_12-cuda12_4-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -3556,8 +4795,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3608,7 +4847,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_12-cuda12_1 + name: conda-py3_12-cuda12_4 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ 
-3656,24 +4895,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_12-cuda12_1-upload: # Uploading + conda-py3_12-cuda12_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_12-cuda12_1-test + permissions: + id-token: write + contents: read + needs: conda-py3_12-cuda12_4-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: conda-py3_12-cuda12_1 + build_name: conda-py3_12-cuda12_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml index 7b94603344e38..60ba59556926f 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -263,6 +263,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 libtorch-cpu-shared-with-deps-debug-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cpu-shared-with-deps-debug-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -280,8 +283,6 @@ jobs: build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -518,6 +519,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 libtorch-cuda11_8-shared-with-deps-debug-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda11_8-shared-with-deps-debug-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -536,8 +540,6 @@ jobs: build_name: libtorch-cuda11_8-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -774,6 +776,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 libtorch-cuda12_1-shared-with-deps-debug-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda12_1-shared-with-deps-debug-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -792,8 +797,263 @@ jobs: 
build_name: libtorch-cuda12_1-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + libtorch-cuda12_4-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
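The get_ec2_metadata helper defined by each "Display EC2 information" step queries the EC2 instance metadata service at its link-local address, so it only returns data when the job runs on an EC2 runner. A short usage sketch follows; the availability-zone lookup is a standard IMDS path included purely as an example and is not something these jobs print:

    # IMDSv1 query; 169.254.169.254 is only reachable from inside EC2.
    get_ec2_metadata() {
      curl -fsSL "http://169.254.169.254/latest/meta-data/$1"
    }
    echo "ami-id:        $(get_ec2_metadata ami-id)"
    echo "instance-type: $(get_ec2_metadata instance-type)"
    echo "az:            $(get_ec2_metadata placement/availability-zone)"  # illustrative extra category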
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: libtorch-cuda12_4-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda12_4-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda12_4-shared-with-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path 
"HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda12_4-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda12_4-shared-with-deps-debug-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_4-shared-with-deps-debug-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.8" + build_name: libtorch-cuda12_4-shared-with-deps-debug + secrets: + github-token: ${{ secrets.GITHUB_TOKEN 
}} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml index 7f13d5a2f5f88..842de97a1fbe9 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -263,6 +263,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 libtorch-cpu-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cpu-shared-with-deps-release-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -280,8 +283,6 @@ jobs: build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -518,6 +519,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 libtorch-cuda11_8-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda11_8-shared-with-deps-release-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -536,8 +540,6 @@ jobs: build_name: libtorch-cuda11_8-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -774,6 +776,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 libtorch-cuda12_1-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda12_1-shared-with-deps-release-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -792,8 +797,263 @@ jobs: build_name: libtorch-cuda12_1-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + libtorch-cuda12_4-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + 
LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: libtorch-cuda12_4-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda12_4-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda12_4-shared-with-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path 
"HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda12_4-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda12_4-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_4-shared-with-deps-release-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.8" + build_name: libtorch-cuda12_4-shared-with-deps-release + secrets: + github-token: ${{ 
secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml index 69917c03d4ce0..d64c221e7895f 100644 --- a/.github/workflows/generated-windows-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -46,7 +46,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -256,6 +256,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 wheel-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_8-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -269,8 +272,6 @@ jobs: build_name: wheel-py3_8-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -289,7 +290,7 @@ 
jobs: GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -500,6 +501,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 wheel-py3_8-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_8-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -514,8 +518,6 @@ jobs: build_name: wheel-py3_8-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -534,7 +536,7 @@ jobs: GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -745,6 +747,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 wheel-py3_8-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_8-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -759,8 +764,252 @@ jobs: build_name: wheel-py3_8-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_8-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system 
== 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: wheel-py3_8-cuda12_4 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cuda12_4-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cuda12_4 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_8-cuda12_4-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.8" + build_name: wheel-py3_8-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -778,7 +1027,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -988,6 +1237,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 wheel-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_9-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1001,8 +1253,6 @@ jobs: build_name: wheel-py3_9-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1021,7 +1271,7 @@ jobs: GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -1232,6 +1482,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 wheel-py3_9-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_9-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1246,8 +1499,6 @@ jobs: build_name: wheel-py3_9-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1266,7 +1517,7 @@ jobs: GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -1477,6 +1728,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 wheel-py3_9-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_9-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1491,12 +1745,991 @@ jobs: build_name: wheel-py3_9-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cpu-build: + wheel-py3_9-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: wheel-py3_9-cuda12_4 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} 
+ needs: wheel-py3_9-cuda12_4-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda12_4 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-cuda12_4-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: wheel-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cpu-build + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cuda11_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually 
want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
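
Editor's note on the PYTORCH_EXTRA_INSTALL_REQUIREMENTS values bumped in these hunks (e.g. nvidia-nccl-cu12 2.19.3 -> 2.20.5): the value is one "|"-separated list of PEP 508 requirement strings, and the "platform_system == 'Linux' and platform_machine == 'x86_64'" markers mean the pinned NVIDIA wheels are only resolved on Linux x86_64 installs, even though the variable is set inside a Windows build job. A minimal Python sketch of how such markers evaluate, assuming the third-party "packaging" library; this is illustrative only and is not the code used by the builder scripts:

# Illustrative only: shows how the PEP 508 markers carried in
# PYTORCH_EXTRA_INSTALL_REQUIREMENTS gate each extra dependency.
# Assumes the third-party "packaging" library (pip install packaging).
from packaging.requirements import Requirement

extra_requirements = (
    "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
)

for raw in extra_requirements.split(" | "):
    req = Requirement(raw)
    # evaluate() with an explicit environment; with no arguments it uses the
    # running interpreter's platform, so on Windows these markers are False.
    on_linux_x86 = req.marker.evaluate({"platform_system": "Linux", "platform_machine": "x86_64"})
    on_windows = req.marker.evaluate({"platform_system": "Windows", "platform_machine": "AMD64"})
    print(f"{req.name}: linux-x86_64={on_linux_x86} windows={on_windows}")

pip performs the same marker evaluation at install time, which is why embedding these requirements in the Windows wheels built here is harmless outside Linux x86_64.
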
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: wheel-py3_10-cuda11_8 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda11_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cuda11_8-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda11_8 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda11_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda11_8-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda11_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cuda12_1-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: 
This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
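
Editor's note on the overall shape of this diff: it repeats one build -> test -> upload job triad per (Python, CUDA) combination, changing only the job names, DESIRED_CUDA / GPU_ARCH_VERSION, the test runner label, and the artifact name, and it switches every upload job to the reusable ./.github/workflows/_binary-upload.yml with OIDC permissions (id-token: write, contents: read) in place of the removed AWS uploader keys. A rough Python sketch of that naming pattern, purely illustrative (the real workflow file is machine-generated elsewhere in the repository, not by this snippet):

# Illustrative sketch of the job-naming pattern visible in the hunks above:
# one build/test/upload triple per (python, arch) combination.
# This is NOT the actual generator used by the repository.
python_versions = ["3_10", "3_11", "3_12"]
test_runners = {
    "cpu": "windows.4xlarge.nonephemeral",
    "cuda11_8": "windows.8xlarge.nvidia.gpu",
    "cuda12_1": "windows.8xlarge.nvidia.gpu",
    "cuda12_4": "windows.8xlarge.nvidia.gpu",  # newly added in this diff
}

for py in python_versions:
    for arch, test_runner in test_runners.items():
        base = f"wheel-py{py}-{arch}"
        print(f"{base}-build   runs-on: windows.4xlarge.nonephemeral")
        print(f"{base}-test    runs-on: {test_runner}, needs: {base}-build")
        print(f"{base}-upload  needs: {base}-test, uses: ./.github/workflows/_binary-upload.yml")

The test job for each triad downloads the artifact produced by its build job under the same name (e.g. wheel-py3_10-cuda12_4), which is why the artifact renames in these hunks must stay consistent across all three jobs.
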
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: wheel-py3_10-cuda12_1 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda12_1-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cuda12_1-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda12_1 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda12_1-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda12_1-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda12_1 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -1506,11 +2739,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of 
GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -1591,7 +2825,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_10-cpu + name: wheel-py3_10-cuda12_4 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1608,10 +2842,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-test: # Testing + wheel-py3_10-cuda12_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-build - runs-on: windows.4xlarge.nonephemeral + needs: wheel-py3_10-cuda12_4-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1619,8 +2853,9 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: @@ -1670,7 +2905,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_10-cpu + name: wheel-py3_10-cuda12_4 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: 
malfet/checkout@silent-checkout @@ -1718,27 +2953,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-upload: # Uploading + wheel-py3_10-cuda12_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-test + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda12_4-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cpu + build_name: wheel-py3_10-cuda12_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda11_8-build: + wheel-py3_11-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -1748,12 +2985,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + DESIRED_PYTHON: "3.11" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -1834,7 +3070,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_10-cuda11_8 + name: wheel-py3_11-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1851,10 +3087,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_8-test: # Testing + wheel-py3_11-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_8-build - runs-on: windows.8xlarge.nvidia.gpu + needs: wheel-py3_11-cpu-build + runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1862,11 +3098,10 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -1914,7 +3149,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_10-cuda11_8 + name: wheel-py3_11-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1962,28 +3197,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_8-upload: # Uploading + wheel-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_8-test + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda11_8 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_1-build: + wheel-py3_11-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -1993,12 +3228,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + DESIRED_PYTHON: "3.11" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -2079,7 +3314,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_10-cuda12_1 + name: wheel-py3_11-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2096,9 +3331,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_1-test: # Testing + wheel-py3_11-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda12_1-build + needs: wheel-py3_11-cuda11_8-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -2107,11 +3342,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -2159,7 +3394,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_1 + name: wheel-py3_11-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2207,28 +3442,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_1-upload: # Uploading + wheel-py3_11-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda12_1-test + permissions: + id-token: write + contents: read + needs: 
wheel-py3_11-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_1 + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cpu-build: + wheel-py3_11-cuda12_1-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2238,11 +3474,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ 
-2323,7 +3560,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_11-cpu + name: wheel-py3_11-cuda12_1 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2340,10 +3577,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cpu-test: # Testing + wheel-py3_11-cuda12_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cpu-build - runs-on: windows.4xlarge.nonephemeral + needs: wheel-py3_11-cuda12_1-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -2351,8 +3588,9 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" steps: @@ -2402,7 +3640,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_11-cpu + name: wheel-py3_11-cuda12_1 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2450,27 +3688,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cpu-upload: # Uploading + wheel-py3_11-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cpu-test + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cpu + build_name: wheel-py3_11-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda11_8-build: + wheel-py3_11-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2480,12 +3720,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -2566,7 +3806,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_11-cuda11_8 + name: wheel-py3_11-cuda12_4 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2583,9 +3823,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda11_8-test: # Testing + wheel-py3_11-cuda12_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cuda11_8-build + needs: wheel-py3_11-cuda12_4-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -2594,8 +3834,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -2646,7 +3886,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_11-cuda11_8 + name: wheel-py3_11-cuda12_4 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2694,28 +3934,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda11_8-upload: # Uploading + wheel-py3_11-cuda12_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cuda11_8-test + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cuda12_4-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda11_8 + build_name: wheel-py3_11-cuda12_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ 
secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_1-build: + wheel-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2725,12 +3966,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + DESIRED_PYTHON: "3.12" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -2811,7 +4051,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_11-cuda12_1 + name: wheel-py3_12-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2828,10 +4068,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_1-test: # Testing + wheel-py3_12-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cuda12_1-build - runs-on: windows.8xlarge.nvidia.gpu + needs: wheel-py3_12-cpu-build + 
runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -2839,11 +4079,10 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -2891,7 +4130,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_1 + name: wheel-py3_12-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2939,28 +4178,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_1-upload: # Uploading + wheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cuda12_1-test + permissions: + id-token: write + contents: read + needs: wheel-py3_12-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_1 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cpu-build: + wheel-py3_12-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2970,11 +4209,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -3055,7 +4295,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_12-cpu + name: wheel-py3_12-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3072,10 +4312,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cpu-test: # Testing + wheel-py3_12-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cpu-build - runs-on: windows.4xlarge.nonephemeral + needs: wheel-py3_12-cuda11_8-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -3083,8 +4323,9 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: @@ -3134,7 +4375,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_12-cpu + name: wheel-py3_12-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -3182,27 +4423,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cpu-upload: # Uploading + wheel-py3_12-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cpu-test + permissions: + id-token: write + contents: read + needs: wheel-py3_12-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cpu + build_name: wheel-py3_12-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda11_8-build: + wheel-py3_12-cuda12_1-build: 
if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -3212,12 +4455,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -3298,7 +4541,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_12-cuda11_8 + name: wheel-py3_12-cuda12_1 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3315,9 +4558,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda11_8-test: # Testing + wheel-py3_12-cuda12_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cuda11_8-build + needs: wheel-py3_12-cuda12_1-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -3326,8 +4569,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3378,7 +4621,7 @@ jobs: - uses: actions/download-artifact@v3 
name: Download Build Artifacts with: - name: wheel-py3_12-cuda11_8 + name: wheel-py3_12-cuda12_1 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -3426,28 +4669,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda11_8-upload: # Uploading + wheel-py3_12-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cuda11_8-test + permissions: + id-token: write + contents: read + needs: wheel-py3_12-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda11_8 + build_name: wheel-py3_12-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_1-build: + wheel-py3_12-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -3457,12 +4701,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -3543,7 +4787,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_12-cuda12_1 + name: wheel-py3_12-cuda12_4 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3560,9 +4804,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_1-test: # Testing + wheel-py3_12-cuda12_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cuda12_1-build + needs: wheel-py3_12-cuda12_4-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -3571,8 +4815,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3623,7 +4867,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_1 + name: wheel-py3_12-cuda12_4 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -3671,24 +4915,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_1-upload: # Uploading + wheel-py3_12-cuda12_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cuda12_1-test + permissions: + id-token: write + contents: read + needs: wheel-py3_12-cuda12_4-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda12_1 + build_name: wheel-py3_12-cuda12_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/inductor-micro-benchmark.yml b/.github/workflows/inductor-micro-benchmark.yml new file mode 100644 index 0000000000000..4fe0ddf50ef2a --- /dev/null +++ b/.github/workflows/inductor-micro-benchmark.yml @@ -0,0 +1,40 @@ +name: inductor-micro-benchmark + +on: + schedule: + - cron: 0 7 * * * + push: + tags: + - ciflow/inductor-micro-benchmark/* + workflow_dispatch: + + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + 
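The new inductor-micro-benchmark workflow above only fires on the nightly cron, on a ciflow tag push, or via manual dispatch. A minimal sketch of triggering it by hand, assuming a local checkout with push rights and the GitHub CLI installed (the tag suffix below is purely illustrative; in practice the ciflow tag is normally pushed by pytorchbot when the corresponding ciflow label is applied):

    # Option 1: push a tag matching the ciflow/inductor-micro-benchmark/* trigger
    git tag ciflow/inductor-micro-benchmark/12345        # suffix is illustrative
    git push origin ciflow/inductor-micro-benchmark/12345

    # Option 2: fire the workflow_dispatch event directly
    gh workflow run inductor-micro-benchmark.yml --ref main
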
+permissions: read-all + +jobs: + linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build: + name: cuda12.1-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 + docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks + cuda-arch-list: '8.0' + test-matrix: | + { include: [ + { config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.gcp.a100" }, + ]} + + linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-test: + name: cuda12.1-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build + with: + build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 + docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build.outputs.test-matrix }} + use-gha: anything-non-empty-to-use-gha + timeout-minutes: 720 diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml index 444cf3c428136..e485a8bfce1b7 100644 --- a/.github/workflows/inductor-perf-compare.yml +++ b/.github/workflows/inductor-perf-compare.yml @@ -10,6 +10,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +permissions: read-all + jobs: linux-focal-cuda12_1-py3_10-gcc9-inductor-build: name: cuda12.1-py3.10-gcc9-sm80 diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index e8a811d55b6b5..e77c915749f3f 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -4,15 +4,17 @@ on: schedule: - cron: 0 7 * * 1-6 - cron: 0 7 * * 0 + # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it + # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs workflow_dispatch: inputs: training: - description: Run training? + description: Run training (on by default)? required: false type: boolean default: true inference: - description: Run inference? + description: Run inference (off by default)? required: false type: boolean default: false @@ -20,22 +22,17 @@ on: description: Run inductor_default? required: false type: boolean - default: true + default: false dynamic: description: Run inductor_dynamic_shapes? required: false type: boolean - default: true + default: false cudagraphs: description: Run inductor_cudagraphs? required: false type: boolean default: true - cppwrapper: - description: Run inductor_cpp_wrapper for inference? - required: false - type: boolean - default: false freezing_cudagraphs: description: Run inductor_cudagraphs with freezing for inference? 
required: false @@ -56,11 +53,18 @@ on: required: false type: boolean default: false + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +permissions: read-all + jobs: linux-focal-cuda12_1-py3_10-gcc9-inductor-build: name: cuda12.1-py3.10-gcc9-sm80 @@ -84,6 +88,7 @@ jobs: { config: "inductor_torchbench_perf", shard: 3, num_shards: 4, runner: "linux.gcp.a100.large" }, { config: "inductor_torchbench_perf", shard: 4, num_shards: 4, runner: "linux.gcp.a100.large" }, ]} + selected-test-configs: ${{ inputs.benchmark_configs }} secrets: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} @@ -94,7 +99,7 @@ jobs: if: github.event.schedule == '0 7 * * 1-6' with: build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 - dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-aotinductor-true-freezing_cudagraphs-true + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.test-matrix }} use-gha: anything-non-empty-to-use-gha @@ -109,7 +114,7 @@ jobs: if: github.event.schedule == '0 7 * * 0' with: build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 - dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.test-matrix }} use-gha: anything-non-empty-to-use-gha @@ -124,7 +129,7 @@ jobs: if: github.event_name == 'workflow_dispatch' with: build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 - dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }} + dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-false-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.test-matrix }} use-gha: anything-non-empty-to-use-gha diff --git a/.github/workflows/inductor-periodic.yml 
b/.github/workflows/inductor-periodic.yml index f775acf1e9e78..6f8c06ed030b0 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -14,6 +14,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true + +permissions: read-all + jobs: linux-focal-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build: name: cuda12.1-py3.10-gcc9-sm86-periodic-dynamo-benchmarks diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 015b197c2b3cf..0ad799a80bcc0 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -13,26 +13,31 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +permissions: read-all + jobs: - linux-focal-rocm5_7-py3_8-inductor-build: - name: rocm5.7-py3.8-inductor + linux-focal-rocm6_1-py3_8-inductor-build: + name: rocm6.1-py3.8-inductor uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-focal-rocm5.7-py3.8 + build-environment: linux-focal-rocm6.1-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 test-matrix: | { include: [ { config: "inductor", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.2" }, ]} - linux-focal-rocm5_7-py3_8-inductor-test: - name: rocm5.7-py3.8-inductor + linux-focal-rocm6_1-py3_8-inductor-test: + permissions: + id-token: write + contents: read + name: rocm6.1-py3.8-inductor uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_7-py3_8-inductor-build + needs: linux-focal-rocm6_1-py3_8-inductor-build with: - build-environment: linux-focal-rocm5.7-py3.8 - docker-image: ${{ needs.linux-focal-rocm5_7-py3_8-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-rocm5_7-py3_8-inductor-build.outputs.test-matrix }} + build-environment: linux-focal-rocm6.1-py3.8 + docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-inductor-build.outputs.test-matrix }} linux-focal-cuda12_1-py3_10-gcc9-inductor-build: name: cuda12.1-py3.10-gcc9-sm86 @@ -60,6 +65,7 @@ jobs: { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor_cpp_wrapper_abi_compatible", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, ]} secrets: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} @@ -105,7 +111,7 @@ jobs: name: linux-jammy-cpu-py3.8-gcc11-inductor uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-jammy-py3_8-gcc11-build + build-environment: linux-jammy-py3.8-gcc11-build docker-image-name: pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks test-matrix: | { include: [ @@ -119,6 +125,7 @@ jobs: { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.12xlarge" }, { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.12xlarge" }, { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.12xlarge" }, + { config: 
"inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, ]} secrets: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} @@ -128,7 +135,7 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: linux-jammy-cpu-py3_8-gcc11-inductor-build with: - build-environment: linux-jammy-py3_8-gcc11-build + build-environment: linux-jammy-py3.8-gcc11-build docker-image: ${{ needs.linux-jammy-cpu-py3_8-gcc11-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-cpu-py3_8-gcc11-inductor-build.outputs.test-matrix }} secrets: diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml deleted file mode 100644 index f70d715c4d473..0000000000000 --- a/.github/workflows/labeler.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Labeler - -on: -- pull_request_target - -jobs: - triage: - permissions: - contents: read - pull-requests: write - runs-on: ubuntu-latest - # Do not auto-label nightly builds PR - if: ${{ github.event.pull_request.number != 26921 }} - steps: - - uses: actions/labeler@v4 - with: - repo-token: "${{ secrets.GITHUB_TOKEN }}" - sync-labels: '' - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/lint-bc.yml b/.github/workflows/lint-bc.yml index f2d26a0aec9a4..73d7805082026 100644 --- a/.github/workflows/lint-bc.yml +++ b/.github/workflows/lint-bc.yml @@ -1,24 +1,13 @@ name: BC Lint on: - # Copied from check-labels.yml to get around needing approval for first time contributors - # See https://docs.github.com/en/actions/managing-workflow-runs/approving-workflow-runs-from-public-forks - # Only allow pull_request_target when merging to main, not some historical branch. - # - # Make sure to don't introduce explicit checking out and installing/running - # untrusted user code into this workflow! - pull_request_target: - types: [opened, synchronize, reopened, labeled, unlabeled] - branches: [main] - paths-ignore: [.github/workflows/lint-bc.yml] - - # To allow testing PRs that change workflows. - # May be triggered together with pull_request_target, it's OK. pull_request: - types: [opened, synchronize, reopened, labeled, unlabeled] - paths: [.github/workflows/lint-bc.yml] - branches-ignore: [nightly] - + types: + - opened + - synchronize + - reopened + branches-ignore: + - nightly workflow_dispatch: jobs: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 90845e82d67b3..f1b6611d00e03 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -11,56 +11,40 @@ on: - landchecks/* workflow_dispatch: +permissions: read-all # The names of steps that actually test the code should be suffixed with `(nonretryable)`. # When any other step fails, it's job will be retried once by retryBot. 
jobs: - lintrunner: + lintrunner-clang: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: timeout: 120 runner: linux.2xlarge docker-image: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter + # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout + # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 + submodules: true ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - CACHE_DIRECTORY="/tmp/.lintbin" - # Try to recover the cached binaries - if [[ -d "${CACHE_DIRECTORY}" ]]; then - # It's ok to fail this as lintrunner init would download these binaries - # again if they do not exist - cp -r "${CACHE_DIRECTORY}" . || true - fi - - # This has already been cached in the docker image - lintrunner init 2> /dev/null - - # Do build steps necessary for linters - python3 -m tools.linter.clang_tidy.generate_build_files - python3 -m tools.generate_torch_version --is_debug=false - python3 -m tools.pyi.gen_pyi \ - --native-functions-path aten/src/ATen/native/native_functions.yaml \ - --tags-path aten/src/ATen/native/tags.yaml \ - --deprecated-functions-path "tools/autograd/deprecated.yaml" - - RC=0 - # Run lintrunner on all files - if ! lintrunner --force-color --all-files --tee-json=lint.json 2> /dev/null; then - echo "" - echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`. (If you don't get the same results, run \'lintrunner init\' to update your local linter)\e[0m" - echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m" - RC=1 - fi + export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT" + export CLANG=1 + .github/scripts/lintrunner.sh - # Use jq to massage the JSON lint output into GitHub Actions workflow commands. 
- jq --raw-output \ - '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \ - lint.json || true - - exit $RC + lintrunner-noclang: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + timeout: 120 + runner: linux.2xlarge + docker-image: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter + # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout + # to run git rev-parse HEAD~:.ci/docker when a new image is needed + fetch-depth: 0 + submodules: true + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT" + .github/scripts/lintrunner.sh quick-checks: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main @@ -224,7 +208,7 @@ jobs: cache: pip - name: Install dependencies run: | - pip install pytest-rerunfailures==11.1.* pytest-shard==0.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.1.* numpy==1.24.* + pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.1.* numpy==1.24.* pip install torch --pre --index-url https://download.pytorch.org/whl/nightly/cpu/ - name: Run run_test.py (nonretryable) run: | @@ -246,11 +230,11 @@ jobs: with: submodules: false fetch-depth: 1 - - name: Setup Python 3.5 + - name: Setup Python 3.6 if: matrix.test_type == 'older_python_version' uses: actions/setup-python@v4 with: - python-version: '3.5' + python-version: '3.6' architecture: x64 check-latest: false cache: pip diff --git a/.github/workflows/linux-aarch64.yml b/.github/workflows/linux-aarch64.yml new file mode 100644 index 0000000000000..acdb6884971b6 --- /dev/null +++ b/.github/workflows/linux-aarch64.yml @@ -0,0 +1,39 @@ +name: linux-aarch64 + +on: + push: + tags: + - ciflow/linux-aarch64/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + linux-jammy-aarch64-py3_10-build: + name: linux-jammy-aarch64-py3.10 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-aarch64-py3.10 + docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 + runner: linux.arm64.2xlarge + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 4, runner: "linux.arm64.2xlarge" }, + { config: "default", shard: 2, num_shards: 4, runner: "linux.arm64.2xlarge" }, + { config: "default", shard: 3, num_shards: 4, runner: "linux.arm64.2xlarge" }, + { config: "default", shard: 4, num_shards: 4, runner: "linux.arm64.2xlarge" }, + ]} + + linux-jammy-aarch64-py3_10-test: + name: linux-jammy-aarch64-py3.10 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-aarch64-py3_10-build + permissions: + id-token: write + contents: read + with: + build-environment: linux-jammy-aarch64-py3.10 + docker-image: ${{ needs.linux-jammy-aarch64-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-aarch64-py3_10-build.outputs.test-matrix }} diff --git a/.github/workflows/llm_td_retrieval.yml 
b/.github/workflows/llm_td_retrieval.yml new file mode 100644 index 0000000000000..047e8ace0049d --- /dev/null +++ b/.github/workflows/llm_td_retrieval.yml @@ -0,0 +1,120 @@ +name: Retrieval PyTorch Tests for Target Determination + +on: + workflow_call: + +permissions: + id-token: write + contents: read + +jobs: + llm-retrieval: + runs-on: linux.4xlarge + continue-on-error: true + steps: + - name: Clone PyTorch + uses: actions/checkout@v3 + with: + repository: pytorch/pytorch + fetch-depth: 0 + path: pytorch + + - name: Setup Linux + uses: ./pytorch/.github/actions/setup-linux + + - name: Clone CodeLlama + uses: actions/checkout@v3 + with: + repository: osalpekar/codellama + ref: main + path: codellama + + - name: Clone Target Determination Code + uses: actions/checkout@v3 + with: + repository: osalpekar/llm-target-determinator + ref: v0.0.2 + path: llm-target-determinator + + - name: Setup Conda + uses: conda-incubator/setup-miniconda@v2.1.1 + with: + miniconda-version: "py39_4.12.0" + python-version: 3.9 + + - name: Install Requirements + shell: bash -l {0} + run: | + set -euxo pipefail + conda create \ + --yes \ + --quiet \ + --name "tdenv" \ + "python=3.9" + conda activate tdenv + cd "${GITHUB_WORKSPACE}/llm-target-determinator" + pip install -r requirements.txt + cd ../codellama + pip install -e . + + - name: Fetch CodeLlama Checkpoint + shell: bash -l {0} + run: | + set -euxo pipefail + conda activate tdenv + cd codellama/ + mkdir "CodeLlama-7b-Python" + aws s3 cp "s3://target-determinator-assets/CodeLlama-7b-Python" "CodeLlama-7b-Python" --recursive --no-progress + + - name: Fetch indexes + uses: nick-fields/retry@v2.8.2 + with: + max_attempts: 3 + retry_wait_seconds: 10 + timeout_minutes: 5 + shell: bash + command: | + set -euxo pipefail + python3 -m pip install awscli==1.29.40 + cd "${GITHUB_WORKSPACE}"/llm-target-determinator/assets + aws s3 cp "s3://target-determinator-assets/indexes/latest" . 
--recursive + + unzip -o indexer-files\*.zip + rm indexer-files*.zip + + - name: Run Retriever + id: run_retriever + continue-on-error: true # ghstack not currently supported due to problems getting git diff + shell: bash -l {0} + run: | + set -euxo pipefail + conda activate tdenv + cd "${GITHUB_WORKSPACE}"/llm-target-determinator + torchrun \ + --standalone \ + --nnodes=1 \ + --nproc-per-node=1 \ + retriever.py \ + --experiment-name indexer-files \ + --pr-parse-format GITDIFF + cd assets + zip -r mappings.zip mappings + + - name: Upload results to s3 + uses: seemethere/upload-artifact-s3@v5 + if: ${{ steps.run_retriever.outcome == 'success' }} + with: + name: llm_results + retention-days: 14 + if-no-files-found: warn + path: llm-target-determinator/assets/mappings.zip + env: + AWS_ACCESS_KEY_ID: "" + AWS_SECRET_ACCESS_KEY: "" + AWS_SESSION_TOKEN: "" + AWS_DEFAULT_REGION: "" + AWS_REGION: "" + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() diff --git a/.github/workflows/mac-mps.yml b/.github/workflows/mac-mps.yml index d2ec160e07f4b..f57ea0fdd07df 100644 --- a/.github/workflows/mac-mps.yml +++ b/.github/workflows/mac-mps.yml @@ -10,6 +10,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +permissions: read-all + jobs: macos-12-py3-arm64-build: name: macos-12-py3-arm64 @@ -17,7 +19,7 @@ jobs: with: sync-tag: macos-12-py3-arm64-build build-environment: macos-12-py3-arm64 - runner-type: macos-m1-12 + runner-type: macos-m1-stable build-generates-artifacts: true # To match the one pre-installed in the m1 runners python-version: 3.9.12 @@ -27,7 +29,7 @@ jobs: environment-file: .github/requirements/conda-env-macOS-ARM64 test-matrix: | { include: [ - { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-12" }, + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-stable" }, { config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-14" }, ]} diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 76c38c032f573..25f71c70e9486 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -31,7 +31,7 @@ jobs: with: build-environment: linux-jammy-py3.8-gcc11 docker-image: ${{ needs.docs-build.outputs.docker-image }} - push: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} + push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }} run-doxygen: true secrets: GH_PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN }} @@ -40,16 +40,14 @@ jobs: runs-on: ubuntu-latest environment: update-commit-hash steps: - - name: Checkout repo - uses: actions/checkout@v3 - with: - fetch-depth: 0 - name: update-vision-commit-hash - uses: ./.github/actions/update-commit-hash + uses: pytorch/test-infra/.github/actions/update-commit-hash@main if: ${{ github.event_name == 'schedule' }} with: repo-name: vision branch: main + pin-folder: .github/ci_commit_pins + test-infra-ref: main updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }} pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} @@ -57,16 +55,14 @@ jobs: runs-on: ubuntu-latest environment: update-commit-hash steps: - - name: Checkout repo - uses: actions/checkout@v3 - with: - fetch-depth: 0 - name: update-audio-commit-hash - uses: ./.github/actions/update-commit-hash + uses: 
pytorch/test-infra/.github/actions/update-commit-hash@main if: ${{ github.event_name == 'schedule' }} with: repo-name: audio branch: main + pin-folder: .github/ci_commit_pins + test-infra-ref: main updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }} pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} @@ -74,16 +70,13 @@ jobs: runs-on: ubuntu-latest environment: update-commit-hash steps: - - name: Checkout repo - uses: actions/checkout@v3 - with: - fetch-depth: 0 - name: update-executorch-commit-hash - uses: ./.github/actions/update-commit-hash + uses: pytorch/test-infra/.github/actions/update-commit-hash@main if: ${{ github.event_name == 'schedule' }} with: repo-name: executorch branch: main pin-folder: .ci/docker/ci_commit_pins + test-infra-ref: main updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }} pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index a6a8c6efffe97..716a72cc6d235 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -20,7 +20,24 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} cancel-in-progress: true +permissions: read-all + jobs: + llm-td: + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + parallelnative-linux-jammy-py3_8-gcc11-build: name: parallelnative-linux-jammy-py3.8-gcc11 uses: ./.github/workflows/_linux-build.yml @@ -37,7 +54,9 @@ jobs: parallelnative-linux-jammy-py3_8-gcc11-test: name: parallelnative-linux-jammy-py3.8-gcc11 uses: ./.github/workflows/_linux-test.yml - needs: parallelnative-linux-jammy-py3_8-gcc11-build + needs: + - parallelnative-linux-jammy-py3_8-gcc11-build + - target-determination with: build-environment: parallelnative-linux-jammy-py3.8-gcc11 docker-image: ${{ needs.parallelnative-linux-jammy-py3_8-gcc11-build.outputs.docker-image }} @@ -84,7 +103,9 @@ jobs: linux-focal-cuda11_8-py3_10-gcc9-debug-test: name: linux-focal-cuda11.8-py3.10-gcc9-debug uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda11_8-py3_10-gcc9-debug-build + needs: + - linux-focal-cuda11_8-py3_10-gcc9-debug-build + - target-determination with: build-environment: linux-focal-cuda11.8-py3.10-gcc9-debug docker-image: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-debug-build.outputs.docker-image }} @@ -108,7 +129,9 @@ jobs: win-vs2019-cuda11_8-py3-test: name: win-vs2019-cuda11.8-py3 uses: ./.github/workflows/_win-test.yml - needs: win-vs2019-cuda11_8-py3-build + needs: + - win-vs2019-cuda11_8-py3-build + - target-determination with: build-environment: win-vs2019-cuda11.8-py3 cuda-version: "11.8" @@ -194,11 +217,11 @@ jobs: docker-image: ${{ needs.linux-vulkan-focal-py3_11-clang10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-vulkan-focal-py3_11-clang10-build.outputs.test-matrix }} - linux-focal-rocm5_7-py3_8-build: - name: linux-focal-rocm5.7-py3.8 + linux-focal-rocm6_1-py3_8-build: + name: linux-focal-rocm6.1-py3.8 uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-focal-rocm5.7-py3.8 + build-environment: linux-focal-rocm6.1-py3.8 docker-image-name: 
pytorch-linux-focal-rocm-n-py3 test-matrix: | { include: [ @@ -206,11 +229,16 @@ jobs: { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" }, ]} - linux-focal-rocm5_7-py3_8-test: - name: linux-focal-rocm5.7-py3.8 + linux-focal-rocm6_1-py3_8-test: + permissions: + id-token: write + contents: read + name: linux-focal-rocm6.1-py3.8 uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_7-py3_8-build + needs: + - linux-focal-rocm6_1-py3_8-build + - target-determination with: - build-environment: linux-focal-rocm5.7-py3.8 - docker-image: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.test-matrix }} + build-environment: linux-focal-rocm6.1-py3.8 + docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 4c4b33a38868f..0ca9e0d33c8f9 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -17,10 +17,27 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +permissions: read-all + jobs: + llm-td: + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + linux-jammy-py3_8-gcc11-build: name: linux-jammy-py3.8-gcc11 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3.8-gcc11 docker-image-name: pytorch-linux-jammy-py3.8-gcc11 @@ -39,7 +56,9 @@ jobs: linux-jammy-py3_8-gcc11-test: name: linux-jammy-py3.8-gcc11 uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3_8-gcc11-build + needs: + - linux-jammy-py3_8-gcc11-build + - target-determination with: build-environment: linux-jammy-py3.8-gcc11 docker-image: ${{ needs.linux-jammy-py3_8-gcc11-build.outputs.docker-image }} @@ -55,7 +74,7 @@ jobs: linux-jammy-py3_8-gcc11-no-ops: name: linux-jammy-py3.8-gcc11-no-ops - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3.8-gcc11-no-ops docker-image-name: pytorch-linux-jammy-py3.8-gcc11 @@ -66,7 +85,7 @@ jobs: linux-jammy-py3_8-gcc11-pch: name: linux-jammy-py3.8-gcc11-pch - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3.8-gcc11-pch docker-image-name: pytorch-linux-jammy-py3.8-gcc11 @@ -75,9 +94,10 @@ jobs: { config: "default", shard: 1, num_shards: 1 }, ]} + linux-jammy-py3_10-clang15-asan-build: name: linux-jammy-py3.10-clang15-asan - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3.10-clang15-asan docker-image-name: pytorch-linux-jammy-py3-clang15-asan @@ -92,10 +112,13 @@ jobs: ]} sync-tag: asan-build + linux-jammy-py3_10-clang15-asan-test: name: linux-jammy-py3.10-clang15-asan uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3_10-clang15-asan-build + needs: + - linux-jammy-py3_10-clang15-asan-build + - target-determination with: build-environment: 
linux-jammy-py3.10-clang15-asan docker-image: ${{ needs.linux-jammy-py3_10-clang15-asan-build.outputs.docker-image }} @@ -104,7 +127,7 @@ jobs: linux-focal-py3_8-clang10-onnx-build: name: linux-focal-py3.8-clang10-onnx - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-py3.8-clang10-onnx docker-image-name: pytorch-linux-focal-py3-clang10-onnx @@ -117,7 +140,9 @@ jobs: linux-focal-py3_8-clang10-onnx-test: name: linux-focal-py3.8-clang10-onnx uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_8-clang10-onnx-build + needs: + - linux-focal-py3_8-clang10-onnx-build + - target-determination with: build-environment: linux-focal-py3.8-clang10-onnx docker-image: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.docker-image }} @@ -125,7 +150,7 @@ jobs: linux-focal-py3_8-clang10-build: name: linux-focal-py3.8-clang10 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-py3.8-clang10 docker-image-name: pytorch-linux-focal-py3.8-clang10 @@ -136,19 +161,16 @@ jobs: { config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" }, { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 1, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 2, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 3, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 4, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 5, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 6, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 7, num_shards: 7, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" }, ]} - linux-focal-py3_8-clang10-test: name: linux-focal-py3.8-clang10 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_8-clang10-build + needs: + - linux-focal-py3_8-clang10-build + - target-determination with: build-environment: linux-focal-py3.8-clang10 docker-image: ${{ needs.linux-focal-py3_8-clang10-build.outputs.docker-image }} @@ -156,7 +178,7 @@ jobs: linux-focal-py3_11-clang10-build: name: linux-focal-py3.11-clang10 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-py3.11-clang10 docker-image-name: pytorch-linux-focal-py3.11-clang10 @@ -167,27 +189,52 @@ jobs: { config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" }, { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 1, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 2, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 3, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 4, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 5, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 6, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 7, num_shards: 7, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 
1, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" }, ]} + linux-focal-py3_11-clang10-test: name: linux-focal-py3.11-clang10 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_11-clang10-build + needs: + - linux-focal-py3_11-clang10-build + - target-determination with: build-environment: linux-focal-py3.11-clang10 docker-image: ${{ needs.linux-focal-py3_11-clang10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-py3_11-clang10-build.outputs.test-matrix }} + linux-focal-py3_12-clang10-build: + name: linux-focal-py3.12-clang10 + uses: ./.github/workflows/_linux-build-label.yml + with: + build-environment: linux-focal-py3.12-clang10 + docker-image-name: pytorch-linux-focal-py3.12-clang10 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" }, + { config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" }, + ]} + + linux-focal-py3_12-clang10-test: + name: linux-focal-py3.12-clang10 + uses: ./.github/workflows/_linux-test.yml + needs: linux-focal-py3_12-clang10-build + with: + build-environment: linux-focal-py3.12-clang10 + docker-image: ${{ needs.linux-focal-py3_12-clang10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-py3_12-clang10-build.outputs.test-matrix }} + timeout-minutes: 600 + linux-focal-cuda11_8-py3_10-gcc9-build: name: linux-focal-cuda11.8-py3.10-gcc9 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-cuda11.8-py3.10-gcc9 docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9 @@ -201,7 +248,9 @@ jobs: linux-focal-cuda11_8-py3_10-gcc9-test: name: linux-focal-cuda11.8-py3.10-gcc9 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda11_8-py3_10-gcc9-build + needs: + - linux-focal-cuda11_8-py3_10-gcc9-build + - target-determination with: timeout-minutes: 360 build-environment: linux-focal-cuda11.8-py3.10-gcc9 @@ -210,7 +259,7 @@ jobs: linux-focal-cuda12_1-py3_10-gcc9-build: name: linux-focal-cuda12.1-py3.10-gcc9 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-cuda12.1-py3.10-gcc9 docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9 @@ -227,7 +276,9 @@ jobs: linux-focal-cuda12_1-py3_10-gcc9-test: name: linux-focal-cuda12.1-py3.10-gcc9 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda12_1-py3_10-gcc9-build + needs: + - linux-focal-cuda12_1-py3_10-gcc9-build + - target-determination with: timeout-minutes: 360 build-environment: linux-focal-cuda12.1-py3.10-gcc9 @@ -236,10 +287,10 @@ jobs: linux-jammy-py3-clang12-mobile-build: name: linux-jammy-py3-clang12-mobile-build - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3-clang12-mobile-build - docker-image-name: pytorch-linux-jammy-py3-clang12-asan + docker-image-name: pytorch-linux-jammy-py3-clang15-asan build-generates-artifacts: false test-matrix: | { include: [ @@ -248,7 +299,7 @@ jobs: 
linux-jammy-cuda-11_8-cudnn8-py3_8-clang12-build: name: linux-jammy-cuda11.8-cudnn8-py3.8-clang12 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-cuda11.8-cudnn8-py3.8-clang12 docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12 @@ -259,7 +310,7 @@ jobs: linux-focal-py3-clang9-mobile-custom-build-static: name: linux-focal-py3-clang9-mobile-custom-build-static - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-py3-clang9-mobile-custom-build-static docker-image-name: pytorch-linux-focal-py3-clang9-android-ndk-r21e @@ -271,9 +322,9 @@ jobs: linux-focal-py3_8-clang9-xla-build: name: linux-focal-py3_8-clang9-xla - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: - build-environment: linux-focal-py3_8-clang9-xla + build-environment: linux-focal-py3.8-clang9-xla docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.1-lite test-matrix: | { include: [ @@ -285,7 +336,7 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: linux-focal-py3_8-clang9-xla-build with: - build-environment: linux-focal-py3_8-clang9-xla + build-environment: linux-focal-py3.8-clang9-xla docker-image: ${{ needs.linux-focal-py3_8-clang9-xla-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-py3_8-clang9-xla-build.outputs.test-matrix }} @@ -353,7 +404,7 @@ jobs: linux-jammy-py3_8-gcc11-mobile-lightweight-dispatch-build: name: linux-jammy-py3.8-gcc11-mobile-lightweight-dispatch-build - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3.8-gcc111-mobile-lightweight-dispatch-build docker-image-name: pytorch-linux-jammy-py3.8-gcc11 @@ -363,13 +414,13 @@ jobs: { config: "default", shard: 1, num_shards: 1 }, ]} - linux-focal-rocm5_7-py3_8-build: + linux-focal-rocm6_1-py3_8-build: # don't run build twice on main if: github.event_name == 'pull_request' - name: linux-focal-rocm5.7-py3.8 - uses: ./.github/workflows/_linux-build.yml + name: linux-focal-rocm6.1-py3.8 + uses: ./.github/workflows/_linux-build-label.yml with: - build-environment: linux-focal-rocm5.7-py3.8 + build-environment: linux-focal-rocm6.1-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 sync-tag: rocm-build test-matrix: | @@ -381,7 +432,7 @@ jobs: linux-focal-cuda12_1-py3_10-gcc9-sm86-build: name: linux-focal-cuda12.1-py3.10-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86 docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9 @@ -398,7 +449,9 @@ jobs: linux-focal-cuda12_1-py3_10-gcc9-sm86-test: name: linux-focal-cuda12.1-py3.10-gcc9-sm86 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda12_1-py3_10-gcc9-sm86-build + needs: + - linux-focal-cuda12_1-py3_10-gcc9-sm86-build + - target-determination with: build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86 docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-sm86-build.outputs.docker-image }} @@ -406,7 +459,7 @@ jobs: linux-jammy-py3-clang12-executorch-build: name: linux-jammy-py3-clang12-executorch - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3-clang12-executorch docker-image-name: 
pytorch-linux-jammy-py3-clang12-executorch diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 856d13a33a4c3..c32abe592bef2 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -15,12 +15,21 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +permissions: read-all + jobs: - linux-focal-rocm5_7-py3_8-build: - name: linux-focal-rocm5.7-py3.8 - uses: ./.github/workflows/_linux-build.yml + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + permissions: + id-token: write + contents: read + + linux-focal-rocm6_1-py3_8-build: + name: linux-focal-rocm6.1-py3.8 + uses: ./.github/workflows/_linux-build-label.yml with: - build-environment: linux-focal-rocm5.7-py3.8 + build-environment: linux-focal-rocm6.1-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 sync-tag: rocm-build test-matrix: | @@ -33,11 +42,16 @@ jobs: { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" }, ]} - linux-focal-rocm5_7-py3_8-test: - name: linux-focal-rocm5.7-py3.8 + linux-focal-rocm6_1-py3_8-test: + permissions: + id-token: write + contents: read + name: linux-focal-rocm6.1-py3.8 uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_7-py3_8-build + needs: + - linux-focal-rocm6_1-py3_8-build + - target-determination with: - build-environment: linux-focal-rocm5.7-py3.8 - docker-image: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.test-matrix }} + build-environment: linux-focal-rocm6.1-py3.8 + docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }} diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index a9b0c654b4cbc..31db7af8fc550 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -18,7 +18,24 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} cancel-in-progress: true +permissions: read-all + jobs: + llm-td: + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-build: name: linux-focal-cuda12.1-py3-gcc9-slow-gradcheck uses: ./.github/workflows/_linux-build.yml @@ -28,16 +45,20 @@ jobs: cuda-arch-list: 8.6 test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "default", shard: 1, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: 
"default", shard: 3, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, ]} linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-test: name: linux-focal-cuda12.1-py3-gcc9-slow-gradcheck uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-build + needs: + - linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-build + - target-determination with: build-environment: linux-focal-cuda12.1-py3-gcc9-slow-gradcheck docker-image: ${{ needs.linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-build.outputs.docker-image }} @@ -60,7 +81,9 @@ jobs: linux-focal-cuda12_1-py3_10-gcc9-sm86-test: name: linux-focal-cuda12.1-py3.10-gcc9-sm86 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda12_1-py3_10-gcc9-sm86-build + needs: + - linux-focal-cuda12_1-py3_10-gcc9-sm86-build + - target-determination with: build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86 docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-sm86-build.outputs.docker-image }} @@ -80,49 +103,59 @@ jobs: linux-focal-py3_8-clang10-test: name: linux-focal-py3.8-clang10 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_8-clang10-build + needs: + - linux-focal-py3_8-clang10-build + - target-determination with: build-environment: linux-focal-py3.8-clang10 docker-image: ${{ needs.linux-focal-py3_8-clang10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-py3_8-clang10-build.outputs.test-matrix }} - linux-focal-rocm5_6-py3_8-build: - name: linux-focal-rocm5.6-py3.8 + linux-focal-rocm6_1-py3_8-build: + name: linux-focal-rocm6.1-py3.8 uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-focal-rocm5.6-py3.8 + build-environment: linux-focal-rocm6.1-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" }, ]} - linux-focal-rocm5_6-py3_8-test: - name: linux-focal-rocm5.6-py3.8 + linux-focal-rocm6_1-py3_8-test: + permissions: + id-token: write + contents: read + name: linux-focal-rocm6.1-py3.8 uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_6-py3_8-build + needs: + - linux-focal-rocm6_1-py3_8-build + - target-determination with: - build-environment: linux-focal-rocm5.6-py3.8 - docker-image: ${{ needs.linux-focal-rocm5_6-py3_8-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-rocm5_6-py3_8-build.outputs.test-matrix }} + build-environment: linux-focal-rocm6.1-py3.8 + docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }} linux-jammy-py3_10-clang15-asan-build: name: linux-jammy-py3.10-clang15-asan - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3.10-clang15-asan docker-image-name: pytorch-linux-jammy-py3-clang15-asan test-matrix: | { include: [ - { config: "slow", shard: 1, num_shards: 2, runner: "linux.4xlarge" }, - { config: "slow", shard: 2, num_shards: 2, runner: "linux.4xlarge" }, + { config: "slow", shard: 1, num_shards: 3, runner: "linux.4xlarge" }, + { config: "slow", shard: 2, num_shards: 3, runner: "linux.4xlarge" }, + { config: "slow", shard: 3, num_shards: 3, 
runner: "linux.4xlarge" }, ]} sync-tag: asan-build linux-jammy-py3_10-clang15-asan-test: name: linux-jammy-py3.10-clang15-asan uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3_10-clang15-asan-build + needs: + - linux-jammy-py3_10-clang15-asan-build + - target-determination with: build-environment: linux-jammy-py3.10-clang15-asan docker-image: ${{ needs.linux-jammy-py3_10-clang15-asan-build.outputs.docker-image }} diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 39fe67da05d5a..56e349dfa1b82 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -22,6 +22,9 @@ jobs: stale: if: ${{ github.repository == 'pytorch/pytorch' }} runs-on: linux.large.arc + permissions: + contents: read + pull-requests: write steps: - uses: actions/github-script@v6 diff --git a/.github/workflows/target-determination-indexer.yml b/.github/workflows/target-determination-indexer.yml new file mode 100644 index 0000000000000..0ce1bae6a4138 --- /dev/null +++ b/.github/workflows/target-determination-indexer.yml @@ -0,0 +1,144 @@ +name: Index PyTorch Tests for Target Determination + +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * *' + +permissions: + id-token: write + contents: read + +jobs: + index: + runs-on: linux.g5.4xlarge.nvidia.gpu # 1 GPU A10G 24GB each + environment: target-determinator-env + steps: + - name: Clone PyTorch + uses: actions/checkout@v3 + with: + path: pytorch + + - name: Setup Linux + uses: ./pytorch/.github/actions/setup-linux + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9 + working-directory: pytorch + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + env: + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + tag=${ECR_DOCKER_IMAGE##*/} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + id: install-nvidia-driver + uses: pytorch/test-infra/.github/actions/setup-nvidia@main + + - name: Clone CodeLlama + uses: actions/checkout@v3 + with: + repository: osalpekar/codellama + ref: 1ec50e0cfc0fadc3b6ceb146617e2119ab26eb34 + path: codellama + + - name: Clone Target Determination Code + uses: actions/checkout@v3 + with: + repository: osalpekar/llm-target-determinator + ref: v0.0.2 + path: llm-target-determinator + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_target_determinator_s3_read_write + aws-region: us-east-1 + + - name: Download checkpoint + shell: bash + env: + AWS_DEFAULT_REGION: us-east-1 + run: | + # Do this outside of docker so I don't have to put env vars in + pip3 install awscli==1.29.40 + cd codellama + mkdir "CodeLlama-7b-Python" + aws s3 cp \ + "s3://target-determinator-assets/CodeLlama-7b-Python" \ + "CodeLlama-7b-Python" \ + --recursive + + - name: Run indexer + shell: bash -l {0} + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + GITHUB_RUN_ID: ${{ github.run_id }} + AWS_DEFAULT_REGION: us-east-1 + run: | + # detached container should get cleaned up by teardown_ec2_linux + 
container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + chmod +x pytorch/.github/scripts/td_llm_indexer.sh + docker exec -t "${container_name}" sh -c 'pytorch/.github/scripts/td_llm_indexer.sh' + + - name: Upload to s3 + shell: bash -l {0} + env: + AWS_DEFAULT_REGION: us-east-1 + run: | + cd llm-target-determinator/assets + + TIMESTAMP=$(date -Iseconds) + ZIP_NAME="indexer-files-${TIMESTAMP}.zip" + + # Create a zipfile with all the generated indices + zip -r "${ZIP_NAME}" indexer-files + + # Note that because the below 2 operations are not atomic, there will + # be a period of a few seconds between these where there is no index + # present in the latest/ folder. To account for this, the retriever + # should have some retry logic with backoff to ensure fetching the + # index doesn't fail. + # Move the old index into the archived/ folder + aws s3 mv \ + "s3://target-determinator-assets/indexes/latest" \ + "s3://target-determinator-assets/indexes/archived" \ + --recursive + + # Move the new index into the latest/ folder + aws s3 cp \ + "${ZIP_NAME}" \ + "s3://target-determinator-assets/indexes/latest/${ZIP_NAME}" + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true diff --git a/.github/workflows/target_determination.yml b/.github/workflows/target_determination.yml new file mode 100644 index 0000000000000..cd5e758345b59 --- /dev/null +++ b/.github/workflows/target_determination.yml @@ -0,0 +1,81 @@ +name: target-determination + +on: + workflow_call: + +jobs: + target-determination: + # Don't run on forked repos + if: github.repository_owner == 'pytorch' + runs-on: linux.2xlarge + steps: + # [pytorch repo ref] + # Use a pytorch/pytorch reference instead of a reference to the local + # checkout because when we run this action we don't *have* a local + # checkout. In other cases you should prefer a local checkout. 
+ - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Download pytest cache + uses: ./.github/actions/pytest-cache-download + continue-on-error: true + with: + cache_dir: .pytest_cache + job_identifier: ${{ github.workflow }} + + - name: Download LLM Artifacts from S3 + uses: seemethere/download-artifact-s3@v4 + continue-on-error: true + with: + name: llm_results + path: .additional_ci_files/llm_results + + - name: Do TD + id: td + continue-on-error: true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_WORKFLOW: ${{ github.workflow }} + GITHUB_JOB: ${{ github.job }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} + GITHUB_REF: ${{ github.ref }} + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + PR_NUMBER: ${{ github.event.pull_request.number }} + run: | + unzip -o .additional_ci_files/llm_results/mappings.zip -d .additional_ci_files/llm_results || true + python3 -m pip install boto3==1.19.12 + python3 tools/testing/do_target_determination_for_s3.py + + - name: Upload TD results to s3 + uses: seemethere/upload-artifact-s3@v5 + if: steps.td.outcome == 'success' + with: + name: td_results + retention-days: 14 + if-no-files-found: error + path: td_results.json + + - name: Store TD results on GHA + uses: actions/upload-artifact@v3 + if: steps.td.outcome == 'success' + with: + name: td_results.json + retention-days: 14 + if-no-files-found: error + path: td_results.json diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml new file mode 100644 index 0000000000000..73befe34c0782 --- /dev/null +++ b/.github/workflows/torchbench.yml @@ -0,0 +1,38 @@ +name: torchbench + +on: + push: + tags: + - ciflow/torchbench/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + linux-focal-cuda12_1-py3_10-gcc9-torchbench-build-gcp: + name: cuda12.1-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 + docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks + cuda-arch-list: '8.0' + test-matrix: | + { include: [ + { config: "torchbench_gcp_smoketest", shard: 1, num_shards: 1, runner: "linux.gcp.a100" }, + ]} + secrets: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + + linux-focal-cuda12_1-py3_10-gcc9-torchbench-test-gcp: + name: cuda12.1-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: linux-focal-cuda12_1-py3_10-gcc9-torchbench-build-gcp + with: + build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 + docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-torchbench-build-gcp.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-torchbench-build-gcp.outputs.test-matrix }} + use-gha: anything-non-empty-to-use-gha + secrets: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN 
}} diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 5ded7ac152cf1..a990ad6941db1 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -16,7 +16,24 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +permissions: read-all + jobs: + llm-td: + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + # Build PyTorch with BUILD_CAFFE2=ON caffe2-linux-jammy-py3_8-gcc11-build: name: caffe2-linux-jammy-py3.8-gcc11 @@ -45,7 +62,9 @@ jobs: linux-focal-cuda12_1-py3_10-gcc9-test: name: linux-focal-cuda12.1-py3.10-gcc9 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda12_1-py3_10-gcc9-build + needs: + - linux-focal-cuda12_1-py3_10-gcc9-build + - target-determination with: build-environment: linux-focal-cuda12.1-py3.10-gcc9 docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-build.outputs.docker-image }} @@ -93,7 +112,7 @@ jobs: with: sync-tag: macos-12-py3-arm64-build build-environment: macos-12-py3-arm64 - runner-type: macos-m1-12 + runner-type: macos-m1-stable build-generates-artifacts: true # To match the one pre-installed in the m1 runners python-version: 3.9.12 @@ -103,9 +122,9 @@ jobs: environment-file: .github/requirements/conda-env-macOS-ARM64 test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 3, runner: "macos-m1-12" }, - { config: "default", shard: 2, num_shards: 3, runner: "macos-m1-12" }, - { config: "default", shard: 3, num_shards: 3, runner: "macos-m1-12" }, + { config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" }, + { config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" }, + { config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" }, ]} macos-12-py3-arm64-mps-test: @@ -120,13 +139,17 @@ jobs: python-version: 3.9.12 test-matrix: | { include: [ - { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-12" }, + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-stable" }, + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" }, + ]} macos-12-py3-arm64-test: name: macos-12-py3-arm64 uses: ./.github/workflows/_mac-test.yml - needs: macos-12-py3-arm64-build + needs: + - macos-12-py3-arm64-build + - target-determination with: build-environment: macos-12-py3-arm64 # Same as the build job @@ -151,7 +174,9 @@ jobs: win-vs2019-cpu-py3-test: name: win-vs2019-cpu-py3 uses: ./.github/workflows/_win-test.yml - needs: win-vs2019-cpu-py3-build + needs: + - win-vs2019-cpu-py3-build + - target-determination with: build-environment: win-vs2019-cpu-py3 cuda-version: cpu @@ -175,11 +200,11 @@ jobs: { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge.nonephemeral" }, ]} - linux-focal-rocm5_7-py3_8-build: - name: linux-focal-rocm5.7-py3.8 - uses: ./.github/workflows/_linux-build.yml + linux-focal-rocm6_1-py3_8-build: + name: linux-focal-rocm6.1-py3.8 + uses: ./.github/workflows/_linux-build-label.yml with: - build-environment: linux-focal-rocm5.7-py3.8 + build-environment: linux-focal-rocm6.1-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 sync-tag: rocm-build test-matrix: | 
@@ -187,12 +212,17 @@ jobs: { config: "default", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" }, ]} - linux-focal-rocm5_7-py3_8-test: - name: linux-focal-rocm5.7-py3.8 + linux-focal-rocm6_1-py3_8-test: + permissions: + id-token: write + contents: read + name: linux-focal-rocm6.1-py3.8 uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_7-py3_8-build + needs: + - linux-focal-rocm6_1-py3_8-build + - target-determination with: - build-environment: linux-focal-rocm5.7-py3.8 - docker-image: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.test-matrix }} + build-environment: linux-focal-rocm6.1-py3.8 + docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }} tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" diff --git a/.github/workflows/unstable-periodic.yml b/.github/workflows/unstable-periodic.yml index df422752f7e3e..9a41bbd44f268 100644 --- a/.github/workflows/unstable-periodic.yml +++ b/.github/workflows/unstable-periodic.yml @@ -13,6 +13,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} cancel-in-progress: true +permissions: read-all + jobs: # There must be at least one job here to satisfy GitHub action workflow syntax introduction: diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index 7a803b54ef8bb..ac1d49d1cce57 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -12,6 +12,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +permissions: read-all + jobs: # There must be at least one job here to satisfy GitHub action workflow syntax introduction: @@ -30,3 +32,174 @@ jobs: echo echo "Once the jobs are deemed stable enough (% red signal < 5% and TTS < 3h)," echo " they can graduate and move back to pull or trunk." 
+ + # + # Experimental ARC jobs + # + llm-td: + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + + linux-jammy-py3_8-gcc11-build: + name: linux-jammy-py3.8-gcc11 + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-jammy-py3.8-gcc11 + docker-image-name: pytorch-linux-jammy-py3.8-gcc11 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "docs_test", shard: 1, num_shards: 1, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "jit_legacy", shard: 1, num_shards: 1, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "backwards_compat", shard: 1, num_shards: 1, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "distributed", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "distributed", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + ]} + + linux-jammy-py3_8-gcc11-test: + name: linux-jammy-py3.8-gcc11 + uses: ./.github/workflows/_linux-test-rg.yml + needs: + - linux-jammy-py3_8-gcc11-build + - target-determination + with: + build-environment: linux-jammy-py3.8-gcc11 + docker-image: ${{ needs.linux-jammy-py3_8-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_8-gcc11-build.outputs.test-matrix }} + + linux-jammy-py3_8-gcc11-no-ops: + name: linux-jammy-py3.8-gcc11-no-ops + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-jammy-py3.8-gcc11-no-ops + docker-image-name: pytorch-linux-jammy-py3.8-gcc11 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 1 }, + ]} + + linux-jammy-py3_8-gcc11-pch: + name: linux-jammy-py3.8-gcc11-pch + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-jammy-py3.8-gcc11-pch + docker-image-name: pytorch-linux-jammy-py3.8-gcc11 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 1 }, + ]} + + linux-focal-py3_8-clang10-onnx-build: + name: linux-focal-py3.8-clang10-onnx + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-focal-py3.8-clang10-onnx + docker-image-name: pytorch-linux-focal-py3-clang10-onnx + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + ]} + + linux-focal-py3_8-clang10-onnx-test: + name: linux-focal-py3.8-clang10-onnx + uses: ./.github/workflows/_linux-test-rg.yml + needs: + - linux-focal-py3_8-clang10-onnx-build + - target-determination + with: + build-environment: linux-focal-py3.8-clang10-onnx + docker-image: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.test-matrix }} + + linux-jammy-py3_10-clang15-asan-build: + name: linux-jammy-py3.10-clang15-asan + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-jammy-py3.10-clang15-asan + docker-image-name: pytorch-linux-jammy-py3-clang15-asan + test-matrix: | + { include: [ + { config: "default", 
shard: 1, num_shards: 6, runner: "linux.4xlarge" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.4xlarge" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.4xlarge" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.4xlarge" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.4xlarge" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.4xlarge" }, + ]} + sync-tag: asan-build-arc + + linux-focal-py3_8-clang10-build: + name: linux-focal-py3.8-clang10 + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-focal-py3.8-clang10 + docker-image-name: pytorch-linux-focal-py3.8-clang10 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "crossref", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "crossref", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "dynamo", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "dynamo", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "dynamo", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + ]} + + linux-focal-py3_8-clang10-test: + name: linux-focal-py3.8-clang10 + uses: ./.github/workflows/_linux-test-rg.yml + needs: + - linux-focal-py3_8-clang10-build + - target-determination + with: + build-environment: linux-focal-py3.8-clang10 + docker-image: ${{ needs.linux-focal-py3_8-clang10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-py3_8-clang10-build.outputs.test-matrix }} + + linux-focal-py3_11-clang10-build: + name: linux-focal-py3.11-clang10 + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-focal-py3.11-clang10 + docker-image-name: pytorch-linux-focal-py3.11-clang10 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "crossref", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "crossref", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "dynamo", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "dynamo", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "dynamo", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + ]} + + linux-focal-py3_11-clang10-test: + name: linux-focal-py3.11-clang10 + uses: ./.github/workflows/_linux-test-rg.yml + needs: + - linux-focal-py3_11-clang10-build + - target-determination + with: + build-environment: linux-focal-py3.11-clang10 + docker-image: ${{ needs.linux-focal-py3_11-clang10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-py3_11-clang10-build.outputs.test-matrix }} + + # + # End of Experimental ARC jobs + # diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 7b3d42f70ee8a..94a712b377484 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -13,46 +13,13 @@ jobs: 
do_update_viablestrict: if: ${{ github.repository_owner == 'pytorch' }} runs-on: ubuntu-20.04 - environment: mergebot + environment: ${{ (github.event_name == 'schedule') && 'mergebot' || '' }} steps: - - name: Checkout repo - uses: actions/checkout@v3 + - name: Update viable/strict + uses: pytorch/test-infra/.github/actions/update-viablestrict@main with: - fetch-depth: 0 - token: ${{ secrets.MERGEBOT_TOKEN }} - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - architecture: x64 - check-latest: false - cache: pip - cache-dependency-path: | - **/.ci/docker/requirements-ci.txt - **/.github/requirements-gha-cache.txt - - - name: Install Python Packages - run: | - pip3 install rockset==1.0.3 - pip3 install boto3==1.19.12 - - - name: Get latest viable commit - env: - ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }} - run: | - output=$(python3 .github/scripts/fetch_latest_green_commit.py) - echo "latest_viable_sha=$output" >> "${GITHUB_OUTPUT}" - id: get-latest-commit - - - name: Push SHA to viable/strict branch - if: steps.get-latest-commit.outputs.latest_viable_sha != 'None' - env: - GITHUB_TOKEN: ${{ secrets.MERGEBOT_TOKEN }} - run: | - git config --global user.email "pytorchmergebot@users.noreply.github.com" - git config --global user.name "PyTorch MergeBot" - echo "Set the latest sha variable to be ${{ steps.get-latest-commit.outputs.latest_viable_sha }}" - # Pushing an older green commit here will fail because it's non-fast-forward, which is ok - # to ignore because we already have the later green commit in visable/strict - git push origin "${{ steps.get-latest-commit.outputs.latest_viable_sha }}":viable/strict || true + repository: pytorch/pytorch + stable-branch: viable/strict + requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]' + secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }} + rockset-api-key: ${{ secrets.ROCKSET_API_KEY }} diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml index 81dc264cbcd67..db09474fb2120 100644 --- a/.github/workflows/update_pytorch_labels.yml +++ b/.github/workflows/update_pytorch_labels.yml @@ -12,16 +12,22 @@ jobs: update-labels-in-S3: runs-on: ubuntu-22.04 if: ${{ github.repository == 'pytorch/pytorch' }} + permissions: + id-token: write + contents: read steps: - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main with: fetch-depth: 1 submodules: false + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_update_pytorch_labels + aws-region: us-east-1 - name: Update PyTorch labels list in S3 - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} run: | python3 -m pip install boto3==1.19.12 .github/scripts/export_pytorch_labels.py pytorch pytorch diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index 836d25cf5f280..f097b146c21f8 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -8,6 +8,8 @@ on: - cron: 37 7 * * 1 workflow_dispatch: +permissions: read-all + jobs: update-commit-hash: runs-on: ubuntu-latest @@ -19,18 +21,21 @@ jobs: fetch-depth: 0 - name: update-xla-commit-hash continue-on-error: true - uses: ./.github/actions/update-commit-hash + uses: pytorch/test-infra/.github/actions/update-commit-hash@main with: repo-name: xla branch: master + pin-folder: 
.github/ci_commit_pins + test-infra-ref: main updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }} pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} - name: update-triton-commit-hash - uses: ./.github/actions/update-commit-hash + uses: pytorch/test-infra/.github/actions/update-commit-hash@main with: repo-owner: openai repo-name: triton branch: main pin-folder: .ci/docker/ci_commit_pins + test-infra-ref: main updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }} pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index 6cd8909ba8007..b48a7c01cc3be 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -30,6 +30,9 @@ jobs: name: linux-jammy-xpu-py3.8 uses: ./.github/workflows/_xpu-test.yml needs: linux-jammy-xpu-py3_8-build + permissions: + id-token: write + contents: read with: build-environment: linux-jammy-xpu-py3.8 docker-image: ${{ needs.linux-jammy-xpu-py3_8-build.outputs.docker-image }} diff --git a/.gitignore b/.gitignore index 20019ecd170f8..bfb3013c6d191 100644 --- a/.gitignore +++ b/.gitignore @@ -54,6 +54,7 @@ test/.coverage test/.hypothesis/ test/cpp/api/mnist test/custom_operator/model.pt +test/debug/ test/jit_hooks/*.pt test/data/legacy_modules.t7 test/data/*.pt @@ -86,6 +87,7 @@ torch/csrc/api/include/torch/version.h torch/csrc/cudnn/cuDNN.cpp torch/csrc/generated torch/csrc/generic/TensorMethods.cpp +torch/csrc/inductor/aoti_torch/generated/*.cpp torch/csrc/jit/generated/* torch/csrc/jit/fuser/config.h torch/csrc/nn/THCUNN.cpp @@ -126,6 +128,7 @@ env .circleci/scripts/COMMIT_MSG scripts/release_notes/*.json sccache-stats*.json +lint.json # These files get copied over on invoking setup.py torchgen/packaged/* diff --git a/.gitmodules b/.gitmodules index 7e1b09e591cd5..c9b84a3701674 100644 --- a/.gitmodules +++ b/.gitmodules @@ -149,3 +149,6 @@ [submodule "third_party/mimalloc"] path = third_party/mimalloc url = https://github.com/microsoft/mimalloc.git +[submodule "third_party/opentelemetry-cpp"] + path = third_party/opentelemetry-cpp + url = https://github.com/open-telemetry/opentelemetry-cpp.git diff --git a/.isort.cfg b/.isort.cfg deleted file mode 100644 index d14d9bf207e6f..0000000000000 --- a/.isort.cfg +++ /dev/null @@ -1,6 +0,0 @@ -[settings] -include_trailing_comma=True -multi_line_output=3 -skip=third_party -skip_gitignore=True -use_parentheses=True diff --git a/.lintrunner.toml b/.lintrunner.toml index c1f4cb6b5b9b4..7f76a35415ca7 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -1,5 +1,3 @@ -merge_base_with = "origin/main" - [[linter]] code = 'FLAKE8' include_patterns = ['**/*.py'] @@ -48,7 +46,7 @@ init_command = [ 'mccabe==0.7.0', 'pycodestyle==2.11.1', 'pyflakes==3.1.0', - 'torchfix==0.2.0', + 'torchfix==0.4.0 ; python_version >= "3.9"', ] @@ -57,6 +55,8 @@ code = 'CLANGFORMAT' include_patterns = [ 'aten/src/ATen/*.h', 'aten/src/ATen/mps/**/*.mm', + 'aten/src/ATen/xpu/**/*.h', + 'aten/src/ATen/xpu/**/*.cpp', 'aten/src/ATen/native/mps/**/*.mm', 'aten/src/ATen/native/vulkan/**/*.h', 'aten/src/ATen/native/vulkan/**/*.cpp', @@ -64,6 +64,8 @@ include_patterns = [ 'aten/src/ATen/native/**/Foreach*.*', 'aten/src/ATen/native/cuda/fused*.*', 'aten/src/ATen/native/cuda/Fused*.cu', + 'aten/src/ATen/native/cudnn/*.h', + 'aten/src/ATen/native/cudnn/*.cpp', 'c10/**/*.h', 'c10/**/*.cpp', 'torch/csrc/**/*.h', @@ -76,6 +78,7 @@ exclude_patterns = [ 'aten/src/ATen/native/vulkan/api/vk_mem_alloc.h', 'c10/util/strong_type.h', '**/fb/**', + 'torch/csrc/inductor/aoti_torch/generated/**', 
'torch/csrc/jit/serialization/mobile_bytecode_generated.h', 'torch/csrc/utils/pythoncapi_compat.h', 'aten/src/ATen/dlpack.h', @@ -118,39 +121,6 @@ include_patterns = [ ] exclude_patterns = [ '**/fb/**', - 'torch/include/**', - 'torch/csrc/**', - 'torch/_dynamo/**/*.py', - 'torch/_inductor/**/*.py', - 'torch/_numpy/**/*.py', - 'torch/_functorch/aot_autograd.py', - 'torch/_functorch/benchmark_utils.py', - 'torch/_functorch/compile_utils.py', - 'torch/_functorch/compilers.py', - 'torch/_functorch/eager_transforms.py', - 'torch/_functorch/fx_minifier.py', - 'torch/_functorch/partitioners.py', - 'torch/_functorch/top_operators_github_usage.py', - 'torch/_functorch/vmap.py', - 'torch/_subclasses/schema_check_mode.py', - 'torch/distributed/elastic/agent/server/api.py', - 'torch/testing/_internal/**', - 'torch/distributed/fsdp/fully_sharded_data_parallel.py', - # TODO(suo): these exclusions were added just to get lint clean on master. - # Follow up to do more target suppressions and remove them. - 'torch/ao/quantization/fx/convert.py', - 'torch/ao/quantization/_dbr/function_fusion.py', - 'test/test_datapipe.py', - 'caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py', - 'test/test_numpy_interop.py', - 'torch/torch_version.py', - 'torch/fx/proxy.py', - 'torch/fx/passes/shape_prop.py', - 'torch/fx/node.py', - 'torch/fx/experimental/symbolic_shapes.py', - 'torch/fx/experimental/proxy_tensor.py', - 'torch/_subclasses/fake_utils.py', - 'torch/_subclasses/fake_tensor.py', ] command = [ 'python3', @@ -166,45 +136,20 @@ init_command = [ 'numpy==1.24.3 ; python_version == "3.8"', 'numpy==1.26.0 ; python_version >= "3.9"', 'expecttest==0.1.6', - 'mypy==1.7.0', + 'mypy==1.9.0', + 'sympy==1.11.1', 'types-requests==2.27.25', 'types-PyYAML==6.0.7', 'types-tabulate==0.8.8', 'types-protobuf==3.19.18', 'types-pkg-resources==0.1.3', 'types-Jinja2==2.11.9', + 'types-colorama==0.4.6', + 'filelock==3.13.1', 'junitparser==2.1.1', 'rich==10.9.0', - 'pyyaml==6.0', - 'optree==0.10.0', -] - -[[linter]] -code = 'MYPYINDUCTOR' -include_patterns = [ - 'torch/_dynamo/**/*.py', - 'torch/_inductor/**/*.py', -] -exclude_patterns = [ - '**/fb/**', - 'torch/_dynamo/backends/**/*.py', - 'torch/_dynamo/variables/**/*.py', - 'torch/_dynamo/polyfill.py', - 'torch/_inductor/fx_passes/serialized_patterns/**', -] -command = [ - 'python3', - 'tools/linter/adapters/mypy_linter.py', - '--config=mypy-inductor.ini', - '--code=MYPYINDUCTOR', - '--', - '@{{PATHSFILE}}' -] -init_command = [ - 'python3', - 'tools/linter/adapters/pip_init.py', - '--dry-run={{DRYRUN}}', - 'types-colorama==0.4.6', + 'pyyaml==6.0.1', + 'optree==0.11.0', ] [[linter]] @@ -242,10 +187,19 @@ command = [ [[linter]] code = 'CLANGTIDY' include_patterns = [ + # Enable coverage of headers in aten/src/ATen + # and excluding most sub-directories for now. + 'aten/src/ATen/*.h', + 'aten/src/ATen/*.cpp', + 'aten/src/ATen/core/*.h', 'aten/src/ATen/core/*.cpp', + 'aten/src/ATen/functorch/*.h', + 'aten/src/ATen/functorch/*.cpp', 'c10/**/*.cpp', - 'c10/core/**/*.h', - 'c10/util/**/*.h', + 'c10/**/*.h', + 'torch/csrc/*.h', + 'torch/csrc/*.cpp', + 'torch/csrc/**/*.h', 'torch/csrc/**/*.cpp', ] exclude_patterns = [ @@ -254,8 +208,10 @@ exclude_patterns = [ # CUDA files are also excluded. 
'**/fb/**', '**/*pb.h', - '**/*CUDA*', - '**/cuda/*pp', + 'aten/**/cuda/*pp', + 'c10/xpu/**/*.h', + 'c10/xpu/**/*.cpp', + 'c10/cuda/CUDAAlgorithm.h', 'c10/util/complex_math.h', 'c10/util/complex_utils.h', 'c10/util/flat_hash_map.h', @@ -266,14 +222,13 @@ exclude_patterns = [ 'c10/util/SmallVector.h', 'c10/util/win32-headers.h', 'c10/util/*inl.h', + 'c10/test/**/*.h', 'aten/src/ATen/core/TensorImpl_test.cpp', 'third_party/**/*', 'torch/csrc/api/**', 'torch/csrc/autograd/generated/**', - 'torch/csrc/autograd/profiler_legacy.cpp', - 'torch/csrc/cuda/**', - 'torch/csrc/dynamo/*', 'torch/csrc/distributed/**/*', + 'torch/csrc/dynamo/eval_frame.h', 'torch/csrc/inductor/**/*', 'torch/csrc/jit/**/*', 'torch/csrc/jit/serialization/import_legacy.cpp', @@ -322,6 +277,26 @@ command = [ '@{{PATHSFILE}}' ] +[[linter]] +code = 'TYPENOSKIP' +include_patterns = ['mypy.ini'] +command = [ + 'python3', + 'tools/linter/adapters/grep_linter.py', + '--pattern=follow_imports\s*=\s*skip', + '--linter-name=TYPENOSKIP', + '--error-name=use of follow_imports = skip', + """--error-description=\ + follow_imports = skip is forbidden from mypy.ini configuration as it \ + is extremely easy to accidentally turn off type checking unintentionally. If \ + you need to suppress type errors, use a top level # mypy: ignore-errors. \ + Do not rely on automatic Any substitution; instead, manually # type: ignore \ + at use sites or define a pyi type stub with more relaxed types. \ + """, + '--', + '@{{PATHSFILE}}' +] + [[linter]] code = 'NOQA' include_patterns = ['**/*.py', '**/*.pyi'] @@ -1006,7 +981,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'PyYAML==6.0', + 'PyYAML==6.0.1', ] # Black + usort @@ -1040,424 +1015,9 @@ exclude_patterns = [ 'test/_nvfuser/test_dynamo.py', 'test/_nvfuser/test_python_frontend.py', 'test/_nvfuser/test_torchscript.py', - 'test/_test_bazel.py', - 'test/ao/sparsity/test_activation_sparsifier.py', - 'test/ao/sparsity/test_composability.py', - 'test/ao/sparsity/test_data_scheduler.py', - 'test/ao/sparsity/test_data_sparsifier.py', - 'test/ao/sparsity/test_kernels.py', - 'test/ao/sparsity/test_parametrization.py', - 'test/ao/sparsity/test_qlinear_packed_params.py', - 'test/ao/sparsity/test_scheduler.py', - 'test/ao/sparsity/test_sparsifier.py', - 'test/ao/sparsity/test_sparsity_utils.py', - 'test/ao/sparsity/test_structured_sparsifier.py', - 'test/autograd/test_complex.py', - 'test/autograd/test_fallback.py', - 'test/autograd/test_functional.py', - 'test/backends/xeon/test_launch.py', - 'test/benchmark_utils/test_benchmark_utils.py', - 'test/bottleneck_test/test.py', - 'test/bottleneck_test/test_args.py', - 'test/bottleneck_test/test_cuda.py', - 'test/conftest.py', - 'test/cpp/__init__.py', - 'test/cpp/aot_inductor/test.py', - 'test/cpp/api/init_baseline.py', - 'test/cpp/api/optim_baseline.py', - 'test/cpp/jit/__init__.py', - 'test/cpp/jit/tests_setup.py', - 'test/cpp_api_parity/__init__.py', - 'test/cpp_api_parity/functional_impl_check.py', - 'test/cpp_api_parity/module_impl_check.py', - 'test/cpp_api_parity/parity_table_parser.py', - 'test/cpp_api_parity/sample_functional.py', - 'test/cpp_api_parity/sample_module.py', - 'test/cpp_api_parity/utils.py', - 'test/cpp_extensions/no_python_abi_suffix_test/setup.py', - 'test/cpp_extensions/setup.py', - 'test/cpp_extensions/torch_test_cpp_extension/__init__.py', - 'test/create_dummy_torchscript_model.py', - 'test/custom_backend/backend.py', - 'test/custom_backend/test_custom_backend.py', - 
'test/custom_operator/model.py', - 'test/custom_operator/test_custom_ops.py', 'test/delete.py', - 'test/distributed/_shard/sharded_optim/test_sharded_optim.py', - 'test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py', - 'test/distributed/_shard/sharded_tensor/ops/test_embedding.py', - 'test/distributed/_shard/sharded_tensor/ops/test_embedding_bag.py', - 'test/distributed/_shard/sharded_tensor/ops/test_init.py', - 'test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py', - 'test/distributed/_shard/sharded_tensor/test_logger.py', - 'test/distributed/_shard/sharded_tensor/test_sharded_tensor.py', - 'test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py', - 'test/distributed/_shard/sharding_plan/test_sharding_plan.py', - 'test/distributed/_shard/sharding_spec/test_sharding_spec.py', - 'test/distributed/_shard/test_sharder.py', - 'test/distributed/_tools/test_memory_tracker.py', - 'test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py', - 'test/distributed/algorithms/quantization/test_quantization.py', - 'test/distributed/algorithms/test_join.py', - 'test/distributed/argparse_util_test.py', - 'test/distributed/bin/test_script.py', - 'test/distributed/elastic/agent/server/test/__init__.py', - 'test/distributed/elastic/agent/server/test/api_test.py', - 'test/distributed/elastic/agent/server/test/local_elastic_agent_test.py', - 'test/distributed/elastic/events/lib_test.py', - 'test/distributed/elastic/metrics/__init__.py', - 'test/distributed/elastic/metrics/api_test.py', - 'test/distributed/elastic/multiprocessing/api_test.py', - 'test/distributed/elastic/multiprocessing/bin/echo1.py', - 'test/distributed/elastic/multiprocessing/bin/echo2.py', - 'test/distributed/elastic/multiprocessing/bin/echo3.py', - 'test/distributed/elastic/multiprocessing/bin/test_script.py', - 'test/distributed/elastic/multiprocessing/bin/zombie_test.py', - 'test/distributed/elastic/multiprocessing/errors/api_test.py', - 'test/distributed/elastic/multiprocessing/errors/error_handler_test.py', - 'test/distributed/elastic/multiprocessing/redirects_test.py', - 'test/distributed/elastic/multiprocessing/tail_log_test.py', - 'test/distributed/elastic/rendezvous/__init__.py', - 'test/distributed/elastic/rendezvous/api_test.py', - 'test/distributed/elastic/rendezvous/c10d_rendezvous_backend_test.py', - 'test/distributed/elastic/rendezvous/dynamic_rendezvous_test.py', - 'test/distributed/elastic/rendezvous/etcd_rendezvous_backend_test.py', - 'test/distributed/elastic/rendezvous/etcd_rendezvous_test.py', - 'test/distributed/elastic/rendezvous/etcd_server_test.py', - 'test/distributed/elastic/rendezvous/rendezvous_backend_test.py', - 'test/distributed/elastic/rendezvous/static_rendezvous_test.py', - 'test/distributed/elastic/rendezvous/utils_test.py', - 'test/distributed/elastic/timer/__init__.py', - 'test/distributed/elastic/timer/api_test.py', - 'test/distributed/elastic/timer/file_based_local_timer_test.py', - 'test/distributed/elastic/timer/local_timer_example.py', - 'test/distributed/elastic/timer/local_timer_test.py', - 'test/distributed/elastic/utils/__init__.py', - 'test/distributed/elastic/utils/data/__init__.py', - 'test/distributed/elastic/utils/data/cycling_iterator_test.py', - 'test/distributed/elastic/utils/distributed_test.py', - 'test/distributed/elastic/utils/logging_test.py', - 'test/distributed/elastic/utils/util_test.py', - 'test/distributed/launcher/__init__.py', - 'test/distributed/launcher/api_test.py', - 'test/distributed/launcher/bin/test_script.py', - 
'test/distributed/launcher/bin/test_script_init_method.py', - 'test/distributed/launcher/bin/test_script_is_torchelastic_launched.py', - 'test/distributed/launcher/bin/test_script_local_rank.py', - 'test/distributed/launcher/launch_test.py', - 'test/distributed/launcher/run_test.py', - 'test/distributed/nn/jit/__init__.py', - 'test/distributed/nn/jit/test_instantiator.py', - 'test/distributed/optim/test_apply_optimizer_in_backward.py', - 'test/distributed/optim/test_named_optimizer.py', - 'test/distributed/optim/test_zero_redundancy_optimizer.py', - 'test/distributed/pipeline/sync/__init__.py', - 'test/distributed/pipeline/sync/conftest.py', - 'test/distributed/pipeline/sync/skip/__init__.py', - 'test/distributed/pipeline/sync/skip/test_api.py', - 'test/distributed/pipeline/sync/skip/test_gpipe.py', - 'test/distributed/pipeline/sync/skip/test_inspect_skip_layout.py', - 'test/distributed/pipeline/sync/skip/test_leak.py', - 'test/distributed/pipeline/sync/skip/test_portal.py', - 'test/distributed/pipeline/sync/skip/test_stash_pop.py', - 'test/distributed/pipeline/sync/skip/test_tracker.py', - 'test/distributed/pipeline/sync/skip/test_verify_skippables.py', - 'test/distributed/pipeline/sync/test_balance.py', - 'test/distributed/pipeline/sync/test_bugs.py', - 'test/distributed/pipeline/sync/test_checkpoint.py', - 'test/distributed/pipeline/sync/test_copy.py', - 'test/distributed/pipeline/sync/test_deferred_batch_norm.py', - 'test/distributed/pipeline/sync/test_dependency.py', - 'test/distributed/pipeline/sync/test_inplace.py', - 'test/distributed/pipeline/sync/test_microbatch.py', - 'test/distributed/pipeline/sync/test_phony.py', - 'test/distributed/pipeline/sync/test_pipe.py', - 'test/distributed/pipeline/sync/test_pipeline.py', - 'test/distributed/pipeline/sync/test_stream.py', - 'test/distributed/pipeline/sync/test_transparency.py', - 'test/distributed/pipeline/sync/test_worker.py', - 'test/distributed/rpc/cuda/test_tensorpipe_agent.py', - 'test/distributed/rpc/test_faulty_agent.py', - 'test/distributed/rpc/test_share_memory.py', - 'test/distributed/rpc/test_tensorpipe_agent.py', - 'test/distributed/tensor/parallel/__init__.py', - 'test/distributed/tensor/parallel/test_ddp_2d_parallel.py', - 'test/distributed/tensor/parallel/test_fsdp_2d_parallel.py', - 'test/distributed/tensor/parallel/test_parallelize_api.py', - 'test/distributed/tensor/parallel/test_tp_examples.py', - 'test/distributed/tensor/parallel/test_tp_random_state.py', - 'test/distributed/tensor/parallel/test_tp_style.py', - 'test/distributed/tensor/parallel/test_view_sharding_dim_change.py', - 'test/distributed/test_c10d_common.py', - 'test/distributed/test_c10d_gloo.py', - 'test/distributed/test_c10d_logger.py', - 'test/distributed/test_c10d_nccl.py', - 'test/distributed/test_c10d_object_collectives.py', - 'test/distributed/test_c10d_pypg.py', - 'test/distributed/test_c10d_spawn.py', - 'test/distributed/test_c10d_spawn_gloo.py', - 'test/distributed/test_c10d_spawn_nccl.py', - 'test/distributed/test_c10d_spawn_ucc.py', - 'test/distributed/test_c10d_ucc.py', - 'test/distributed/test_collective_utils.py', - 'test/distributed/test_data_parallel.py', - 'test/distributed/test_distributed_spawn.py', - 'test/distributed/test_dynamo_distributed.py', - 'test/distributed/test_fake_pg.py', - 'test/distributed/test_functional_api.py', - 'test/distributed/test_inductor_collectives.py', - 'test/distributed/test_launcher.py', - 'test/distributed/test_multi_threaded_pg.py', - 'test/distributed/test_nccl.py', - 
'test/distributed/test_pg_wrapper.py', - 'test/distributed/test_store.py', - 'test/distributions/test_constraints.py', - 'test/distributions/test_distributions.py', - 'test/distributions/test_transforms.py', - 'test/distributions/test_utils.py', - 'test/error_messages/storage.py', 'test/expect/__init__.py', - 'test/export/test_db.py', - 'test/export/test_export.py', - 'test/export/test_funtionalized_assertions.py', - 'test/export/test_pass_infra.py', - 'test/export/test_passes.py', - 'test/export/test_serialize.py', - 'test/export/test_upgrade.py', - 'test/export/test_verifier.py', - 'test/export/test_unflatten.py', - 'test/forward_backward_compatibility/check_forward_backward_compatibility.py', - 'test/forward_backward_compatibility/dump_all_function_schemas.py', - 'test/functorch/attn_ft.py', - 'test/functorch/attn_positional.py', - 'test/functorch/common_utils.py', - 'test/functorch/discover_coverage.py', - 'test/functorch/functorch_additional_op_db.py', - 'test/functorch/test_aotdispatch.py', - 'test/functorch/test_control_flow.py', - 'test/functorch/test_dims.py', - 'test/functorch/test_eager_transforms.py', - 'test/functorch/test_logging.py', - 'test/functorch/test_memory_efficient_fusion.py', - 'test/functorch/test_minifier.py', - 'test/functorch/test_ops.py', - 'test/functorch/test_parsing.py', - 'test/functorch/test_rearrange.py', - 'test/functorch/test_vmap.py', - 'test/functorch/test_vmap_registrations.py', - 'test/functorch/xfail_suggester.py', - 'test/fx/named_tup.py', - 'test/fx/quantization.py', - 'test/fx/test_common_passes.py', - 'test/fx/test_cse_pass.py', - 'test/fx/test_dce_pass.py', - 'test/fx/test_future.py', - 'test/fx/test_fx_const_fold.py', - 'test/fx/test_fx_param_shape_control_flow.py', - 'test/fx/test_gradual_type.py', - 'test/fx/test_matcher_utils.py', - 'test/fx/test_pass_infra.py', - 'test/fx/test_source_matcher_utils.py', - 'test/fx/test_subgraph_rewriter.py', - 'test/fx/test_z3_gradual_types.py', - 'test/fx/test_fx_split.py', - 'test/jit/__init__.py', - 'test/jit/_imported_class_test/__init__.py', - 'test/jit/_imported_class_test/bar.py', - 'test/jit/_imported_class_test/foo.py', - 'test/jit/_imported_class_test/very/__init__.py', - 'test/jit/_imported_class_test/very/very/__init__.py', - 'test/jit/_imported_class_test/very/very/nested.py', - 'test/jit/fixtures_srcs/__init__.py', - 'test/jit/fixtures_srcs/fixtures_src.py', - 'test/jit/fixtures_srcs/generate_models.py', - 'test/jit/fixtures_srcs/test_upgrader_models_generation.py', - 'test/jit/myexception.py', - 'test/jit/test_alias_analysis.py', - 'test/jit/test_async.py', - 'test/jit/test_aten_pow.py', - 'test/jit/test_attr.py', - 'test/jit/test_autodiff.py', - 'test/jit/test_autodiff_subgraph_slicing.py', - 'test/jit/test_await.py', - 'test/jit/test_backend_nnapi.py', - 'test/jit/test_backends.py', - 'test/jit/test_batch_mm.py', - 'test/jit/test_builtins.py', - 'test/jit/test_class_type.py', - 'test/jit/test_complex.py', - 'test/jit/test_complexity.py', - 'test/jit/test_convert_activation.py', - 'test/jit/test_cuda.py', - 'test/jit/test_custom_operators.py', - 'test/jit/test_data_parallel.py', - 'test/jit/test_dataclasses.py', - 'test/jit/test_dce.py', - 'test/jit/test_device_analysis.py', - 'test/jit/test_dtype_analysis.py', - 'test/jit/test_enum.py', - 'test/jit/test_exception.py', - 'test/jit/test_freezing.py', - 'test/jit/test_functional_blocks.py', - 'test/jit/test_fuser_common.py', - 'test/jit/test_graph_rewrite_passes.py', - 'test/jit/test_hash.py', - 'test/jit/test_hooks.py', - 
'test/jit/test_hooks_modules.py', - 'test/jit/test_ignorable_args.py', - 'test/jit/test_ignore_context_manager.py', - 'test/jit/test_isinstance.py', - 'test/jit/test_jit_utils.py', - 'test/jit/test_list_dict.py', - 'test/jit/test_logging.py', - 'test/jit/test_misc.py', - 'test/jit/test_models.py', - 'test/jit/test_module_apis.py', - 'test/jit/test_module_containers.py', - 'test/jit/test_module_interface.py', - 'test/jit/test_modules.py', - 'test/jit/test_op_decompositions.py', - 'test/jit/test_optimize_for_mobile_preserve_debug_info.py', - 'test/jit/test_parametrization.py', - 'test/jit/test_pdt.py', - 'test/jit/test_peephole.py', - 'test/jit/test_profiler.py', - 'test/jit/test_python_bindings.py', - 'test/jit/test_python_builtins.py', - 'test/jit/test_python_ir.py', - 'test/jit/test_recursive_script.py', - 'test/jit/test_remove_mutation.py', - 'test/jit/test_save_load.py', - 'test/jit/test_save_load_for_op_version.py', - 'test/jit/test_script_profile.py', - 'test/jit/test_scriptmod_ann.py', - 'test/jit/test_slice.py', - 'test/jit/test_sparse.py', - 'test/jit/test_string_formatting.py', - 'test/jit/test_symbolic_shape_analysis.py', - 'test/jit/test_tensor_creation_ops.py', - 'test/jit/test_tensor_methods.py', - 'test/jit/test_torchbind.py', - 'test/jit/test_tracer.py', - 'test/jit/test_type_sharing.py', - 'test/jit/test_types.py', - 'test/jit/test_typing.py', - 'test/jit/test_union.py', - 'test/jit/test_unsupported_ops.py', - 'test/jit/test_upgraders.py', - 'test/jit/test_warn.py', - 'test/jit/test_with.py', - 'test/jit/xnnpack/test_xnnpack_delegate.py', - 'test/jit_hooks/model.py', - 'test/lazy/__init__.py', - 'test/lazy/test_bindings.py', - 'test/lazy/test_debug_util.py', - 'test/lazy/test_extract_compiled_graph.py', - 'test/lazy/test_meta_kernel.py', - 'test/lazy/test_reuse_ir.py', - 'test/lazy/test_step_closures.py', - 'test/lazy/test_ts_opinfo.py', - 'test/linear.py', - 'test/load_torchscript_model.py', - 'test/mkl_verbose.py', - 'test/mkldnn_verbose.py', - 'test/mobile/custom_build/prepare_model.py', - 'test/mobile/lightweight_dispatch/tests_setup.py', - 'test/mobile/model_test/android_api_module.py', - 'test/mobile/model_test/builtin_ops.py', - 'test/mobile/model_test/gen_test_model.py', - 'test/mobile/model_test/math_ops.py', - 'test/mobile/model_test/nn_ops.py', - 'test/mobile/model_test/quantization_ops.py', - 'test/mobile/model_test/sampling_ops.py', - 'test/mobile/model_test/tensor_ops.py', - 'test/mobile/model_test/torchvision_models.py', - 'test/mobile/model_test/update_production_ops.py', - 'test/mobile/nnc/aot_test_model.py', - 'test/mobile/test_bytecode.py', - 'test/mobile/test_lite_script_module.py', - 'test/mobile/test_lite_script_type.py', - 'test/mobile/test_quantize_fx_lite_script_module.py', - 'test/mobile/test_upgrader_codegen.py', - 'test/mobile/test_upgraders.py', - 'test/nn/test_convolution.py', - 'test/nn/test_dropout.py', - 'test/nn/test_embedding.py', - 'test/nn/test_init.py', - 'test/nn/test_lazy_modules.py', - 'test/nn/test_module_hooks.py', - 'test/nn/test_multihead_attention.py', - 'test/nn/test_packed_sequence.py', - 'test/nn/test_parametrization.py', - 'test/nn/test_pooling.py', - 'test/nn/test_pruning.py', - 'test/onnx_caffe2/export_onnx_tests_filter.py', - 'test/onnx_caffe2/export_onnx_tests_generator.py', - 'test/onnx_caffe2/test_caffe2_common.py', - 'test/onnx_caffe2/test_custom_ops.py', - 'test/onnx_caffe2/test_pytorch_helper.py', - 'test/onnx_caffe2/test_pytorch_onnx_caffe2.py', - 'test/onnx_caffe2/test_pytorch_onnx_caffe2_quantized.py', - 
'test/onnx_caffe2/test_verify.py', - 'test/optim/test_lrscheduler.py', - 'test/optim/test_optim.py', - 'test/optim/test_swa_utils.py', - 'test/package/__init__.py', - 'test/package/common.py', - 'test/package/generate_bc_packages.py', - 'test/package/module_a.py', - 'test/package/module_a_remapped_path.py', - 'test/package/package_a/__init__.py', - 'test/package/package_a/fake_interface.py', - 'test/package/package_a/fake_script_class.py', - 'test/package/package_a/long_name.py', - 'test/package/package_a/std_sys_module_hacks.py', - 'test/package/package_a/subpackage.py', - 'test/package/package_a/test_all_leaf_modules_tracer.py', - 'test/package/package_a/test_module.py', - 'test/package/package_a/test_nn_module.py', - 'test/package/package_a/use_dunder_package.py', - 'test/package/package_a/use_torch_package_importer.py', - 'test/package/package_b/__init__.py', - 'test/package/package_b/subpackage_0/__init__.py', - 'test/package/package_b/subpackage_0/subsubpackage_0/__init__.py', - 'test/package/package_b/subpackage_1.py', - 'test/package/package_b/subpackage_2.py', - 'test/package/package_c/__init__.py', - 'test/package/package_c/test_module.py', - 'test/package/package_d/__init__.py', - 'test/package/package_d/imports_directly.py', - 'test/package/package_d/imports_indirectly.py', - 'test/package/package_d/subpackage_0/__init__.py', - 'test/package/package_d/subpackage_0/subsubpackage_0/__init__.py', - 'test/package/test_analyze.py', - 'test/package/test_dependency_api.py', - 'test/package/test_dependency_hooks.py', - 'test/package/test_digraph.py', - 'test/package/test_directory_reader.py', - 'test/package/test_glob_group.py', - 'test/package/test_importer.py', - 'test/package/test_load_bc_packages.py', - 'test/package/test_mangling.py', - 'test/package/test_misc.py', - 'test/package/test_model.py', - 'test/package/test_package_fx.py', - 'test/package/test_package_script.py', - 'test/package/test_repackage.py', - 'test/package/test_resources.py', - 'test/package/test_save_load.py', - 'test/package/test_trace_dep/__init__.py', - 'test/profiler/test_memory_profiler.py', - 'test/profiler/test_profiler.py', - 'test/profiler/test_profiler_tree.py', 'test/quantization/__init__.py', - 'test/quantization/ao_migration/__init__.py', - 'test/quantization/ao_migration/common.py', - 'test/quantization/ao_migration/test_ao_migration.py', - 'test/quantization/ao_migration/test_quantization.py', - 'test/quantization/ao_migration/test_quantization_fx.py', - 'test/quantization/bc/__init__.py', - 'test/quantization/bc/test_backward_compatibility.py', 'test/quantization/core/__init__.py', 'test/quantization/core/experimental/apot_fx_graph_mode_ptq.py', 'test/quantization/core/experimental/apot_fx_graph_mode_qat.py', @@ -1492,54 +1052,12 @@ exclude_patterns = [ 'test/quantization/fx/test_numeric_suite_fx.py', 'test/quantization/fx/test_quantize_fx.py', 'test/quantization/fx/test_subgraph_rewriter.py', - 'test/quantization/jit/__init__.py', - 'test/quantization/jit/test_deprecated_jit_quant.py', - 'test/quantization/jit/test_fusion_passes.py', - 'test/quantization/jit/test_ondevice_quantization.py', - 'test/quantization/jit/test_quantize_jit.py', - 'test/quantization/pt2e/test_graph_utils.py', - 'test/quantization/pt2e/test_quantize_pt2e.py', - 'test/quantization/pt2e/test_x86inductor_quantizer.py', - 'test/scripts/cuda_memcheck_common.py', - 'test/scripts/run_cuda_memcheck.py', - 'test/simulate_nccl_errors.py', - 'test/test_ao_sparsity.py', - 'test/test_autocast.py', - 'test/test_autograd.py', - 
'test/test_binary_ufuncs.py', - 'test/test_bundled_images.py', - 'test/test_bundled_inputs.py', - 'test/test_comparison_utils.py', - 'test/test_compile_benchmark_util.py', - 'test/test_complex.py', - 'test/test_cpp_api_parity.py', - 'test/test_cpp_extensions_aot.py', - 'test/test_cpp_extensions_jit.py', - 'test/test_cpp_extensions_open_device_registration.py', - 'test/test_cuda.py', - 'test/test_cuda_expandable_segments.py', - 'test/test_cuda_multigpu.py', - 'test/test_cuda_nvml_based_avail.py', - 'test/test_cuda_primary_ctx.py', - 'test/test_cuda_sanitizer.py', - 'test/test_cuda_trace.py', - 'test/test_custom_op_testing.py', - 'test/test_dataloader.py', 'test/test_datapipe.py', - 'test/test_decomp.py', - 'test/test_deploy.py', - 'test/test_determination.py', - 'test/test_dispatch.py', - 'test/test_dlpack.py', - 'test/test_dynamic_shapes.py', - 'test/test_expanded_weights.py', 'test/test_fake_tensor.py', 'test/test_flop_counter.py', - 'test/test_foreach.py', 'test/test_function_schema.py', 'test/test_functional_autograd_benchmark.py', 'test/test_functional_optim.py', - 'test/test_functionalization.py', 'test/test_functionalization_of_rng_ops.py', 'test/test_futures.py', 'test/test_fx.py', @@ -1548,7 +1066,6 @@ exclude_patterns = [ 'test/test_fx_reinplace_pass.py', 'test/test_hub.py', 'test/test_import_stats.py', - 'test/test_indexing.py', 'test/test_itt.py', 'test/test_jit.py', 'test/test_jit_autocast.py', @@ -1564,7 +1081,6 @@ exclude_patterns = [ 'test/test_jit_string.py', 'test/test_jiterator.py', 'test/test_kernel_launch_checks.py', - 'test/test_legacy_vmap.py', 'test/test_license.py', 'test/test_linalg.py', 'test/test_logging.py', @@ -1579,11 +1095,9 @@ exclude_patterns = [ 'test/test_mkldnn_verbose.py', 'test/test_mobile_optimizer.py', 'test/test_model_dump.py', - 'test/test_module_init.py', 'test/test_modules.py', 'test/test_monitor.py', 'test/test_mps.py', - 'test/test_multiprocessing.py', 'test/test_multiprocessing_spawn.py', 'test/test_namedtensor.py', 'test/test_namedtuple_return_api.py', @@ -1597,10 +1111,6 @@ exclude_patterns = [ 'test/test_nvfuser_dynamo.py', 'test/test_nvfuser_frontend.py', 'test/test_openmp.py', - 'test/test_ops.py', - 'test/test_ops_fwd_gradients.py', - 'test/test_ops_gradients.py', - 'test/test_ops_jit.py', 'test/test_optim.py', 'test/test_out_dtype_op.py', 'test/test_overrides.py', @@ -1610,7 +1120,6 @@ exclude_patterns = [ 'test/test_proxy_tensor.py', 'test/test_pruning_op.py', 'test/test_public_bindings.py', - 'test/test_python_dispatch.py', 'test/test_quantization.py', 'test/test_reductions.py', 'test/test_scatter_gather_ops.py', @@ -1642,7 +1151,6 @@ exclude_patterns = [ 'test/test_type_promotion.py', 'test/test_unary_ufuncs.py', 'test/test_utils.py', - 'test/test_view_ops.py', 'test/test_vulkan.py', 'test/test_xnnpack_integration.py', 'test/torch_np/numpy_test/**/*.py', @@ -1719,24 +1227,6 @@ exclude_patterns = [ 'torch/_export/serde/upgrade.py', 'torch/_export/trace.py', 'torch/_export/verifier.py', - 'torch/_functorch/__init__.py', - 'torch/_functorch/aot_autograd.py', - 'torch/_functorch/apis.py', - 'torch/_functorch/autograd_function.py', - 'torch/_functorch/batch_norm_replacement.py', - 'torch/_functorch/benchmark_utils.py', - 'torch/_functorch/compile_utils.py', - 'torch/_functorch/compilers.py', - 'torch/_functorch/config.py', - 'torch/_functorch/deprecated.py', - 'torch/_functorch/eager_transforms.py', - 'torch/_functorch/fx_minifier.py', - 'torch/_functorch/partitioners.py', - 'torch/_functorch/pyfunctorch.py', - 
'torch/_functorch/python_key.py', - 'torch/_functorch/top_operators_github_usage.py', - 'torch/_functorch/utils.py', - 'torch/_functorch/vmap.py', 'torch/_higher_order_ops/__init__.py', 'torch/_higher_order_ops/out_dtype.py', 'torch/_higher_order_ops/wrap.py', @@ -1934,6 +1424,7 @@ exclude_patterns = [ 'torch/compiler/__init__.py', 'torch/contrib/__init__.py', 'torch/contrib/_tensorboard_vis.py', + "torch/cuda/_gpu_trace.py", 'torch/cuda/_memory_viz.py', # mypy: Value of type "object" is not indexable 'torch/distributed/__init__.py', 'torch/distributed/_composable_state.py', @@ -2346,25 +1837,6 @@ exclude_patterns = [ 'torch/nn/utils/rnn.py', 'torch/nn/utils/spectral_norm.py', 'torch/nn/utils/weight_norm.py', - 'torch/optim/__init__.py', - 'torch/optim/_functional.py', - 'torch/optim/_multi_tensor/__init__.py', - 'torch/optim/adadelta.py', - 'torch/optim/adagrad.py', - 'torch/optim/adam.py', - 'torch/optim/adamax.py', - 'torch/optim/adamw.py', - 'torch/optim/asgd.py', - 'torch/optim/lbfgs.py', - 'torch/optim/lr_scheduler.py', - 'torch/optim/nadam.py', - 'torch/optim/optimizer.py', - 'torch/optim/radam.py', - 'torch/optim/rmsprop.py', - 'torch/optim/rprop.py', - 'torch/optim/sgd.py', - 'torch/optim/sparse_adam.py', - 'torch/optim/swa_utils.py', 'torch/overrides.py', 'torch/quasirandom.py', 'torch/random.py', @@ -2399,7 +1871,7 @@ exclude_patterns = [ 'torch/testing/_internal/common_subclass.py', 'torch/testing/_internal/common_utils.py', 'torch/testing/_internal/composite_compliance.py', - 'torch/testing/_internal/control_flow_opinfo_db.py', + 'torch/testing/_internal/hop_db.py', 'torch/testing/_internal/custom_op_db.py', 'torch/testing/_internal/data/__init__.py', 'torch/testing/_internal/data/network1.py', @@ -2455,13 +1927,10 @@ exclude_patterns = [ 'torch/testing/_internal/test_module/__init__.py', 'torch/testing/_internal/test_module/future_div.py', 'torch/testing/_internal/test_module/no_future_div.py', - 'torch/torch_version.py', - 'torch/types.py', 'torch/utils/__init__.py', 'torch/utils/_contextlib.py', 'torch/utils/_cpp_extension_versioner.py', 'torch/utils/_crash_handler.py', - 'torch/utils/_cuda_trace.py', 'torch/utils/_device.py', 'torch/utils/_foreach_utils.py', 'torch/utils/_freeze.py', @@ -2470,7 +1939,6 @@ exclude_patterns = [ 'torch/utils/_stats.py', 'torch/utils/_sympy/__init__.py', 'torch/utils/_sympy/functions.py', - 'torch/utils/_sympy/value_ranges.py', 'torch/utils/_traceback.py', 'torch/utils/_zip.py', 'torch/utils/backcompat/__init__.py', @@ -2590,6 +2058,7 @@ exclude_patterns = [ 'torch/utils/viz/__init__.py', 'torch/utils/viz/_cycles.py', 'torch/utils/weak.py', + 'torch/xpu/_gpu_trace.py', ] init_command = [ 'python3', @@ -2682,7 +2151,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'ruff==0.1.11', + 'ruff==0.4.1', ] is_formatter = true @@ -2699,3 +2168,92 @@ command = [ '@{{PATHSFILE}}' ] is_formatter = true + +[[linter]] +code = 'ATEN_CPU_GPU_AGNOSTIC' +include_patterns = [ + # aten source + "aten/src/ATen/*.cpp", + "aten/src/ATen/cpu/*.cpp", + "aten/src/ATen/functorch/**/*.cpp", + "aten/src/ATen/nnapi/*.cpp", + "aten/src/ATen/quantized/*.cpp", + "aten/src/ATen/vulkan/*.cpp", + "aten/src/ATen/metal/*.cpp", + "aten/src/ATen/detail/CPUGuardImpl.cpp", + "aten/src/ATen/detail/MetaGuardImpl.cpp", + # aten native source + "aten/src/ATen/native/cpu/*.cpp", + "aten/src/ATen/native/ao_sparse/cpu/kernels/*.cpp", + "aten/src/ATen/native/ao_sparse/quantized/cpu/kernels/*.cpp", + 
"aten/src/ATen/native/quantized/cpu/kernels/*.cpp", + "aten/src/ATen/native/*.cpp", + "aten/src/ATen/native/cpu/**/*.cpp", + "aten/src/ATen/native/ao_sparse/*.cpp", + "aten/src/ATen/native/ao_sparse/**/*.cpp", + "aten/src/ATen/native/ao_sparse/quantized/*.cpp", + "aten/src/ATen/native/ao_sparse/quantized/**/*.cpp", + "aten/src/ATen/native/nested/*.cpp", + "aten/src/ATen/native/quantized/*.cpp", + "aten/src/ATen/native/quantized/**/*.cpp", + "aten/src/ATen/native/sparse/*.cpp", + "aten/src/ATen/native/transformers/*.cpp", + "aten/src/ATen/native/utils/*.cpp", + "aten/src/ATen/native/xnnpack/*.cpp", + "aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp", + # aten headers + "aten/src/ATen/*.h", + "aten/src/ATen/functorch/**/*.h", + "aten/src/ATen/ops/*.h", + "aten/src/ATen/cpu/**/*.h", + "aten/src/ATen/nnapi/*.h", + "aten/src/ATen/quantized/*.h", + "aten/src/ATen/vulkan/*.h", + "aten/src/ATen/metal/*.h", + "aten/src/ATen/mps/*.h", + # aten native headers + "aten/src/ATen/native/*.h", + "aten/src/ATen/native/cpu/**/*.h", + "aten/src/ATen/native/nested/*.h", + "aten/src/ATen/native/sparse/*.h", + "aten/src/ATen/native/ao_sparse/*.h", + "aten/src/ATen/native/ao_sparse/cpu/*.h", + "aten/src/ATen/native/ao_sparse/quantized/*.h", + "aten/src/ATen/native/ao_sparse/quantized/cpu/*.h", + "aten/src/ATen/native/quantized/*.h", + "aten/src/ATen/native/quantized/cpu/*.h", + "aten/src/ATen/native/transformers/*.h", + "aten/src/ATen/native/quantized/cpu/qnnpack/include/*.h", + "aten/src/ATen/native/utils/*.h", + "aten/src/ATen/native/vulkan/ops/*.h", + "aten/src/ATen/native/xnnpack/*.h", + "aten/src/ATen/native/metal/MetalPrepackOpContext.h", + "aten/src/ATen/native/mps/Copy.h", + "aten/src/ATen/native/mkldnn/**/*.h", +] +exclude_patterns = [ + "aten/src/ATen/Context.h", + "aten/src/ATen/Context.cpp", + "aten/src/ATen/DLConvertor.cpp", + "aten/src/ATen/core/Array.h", + "aten/src/ATen/native/quantized/ConvUtils.h", + "aten/src/ATen/native/sparse/SparseBlasImpl.cpp", # triton implementation + "aten/src/ATen/native/transformers/attention.cpp", + "aten/src/ATen/native/**/cudnn/**", # cudnn is cuda specific +] +command = [ + 'python3', + 'tools/linter/adapters/grep_linter.py', + '--pattern=(^#if.*USE_ROCM.*)|(^#if.*USE_CUDA.*)', + '--linter-name=ATEN_CPU', + '--error-name=aten-cpu should be gpu agnostic', + """--error-description=\ + We strongly discourage the compile-time divergence \ + on ATen-CPU code for different GPU code. 
This \ + disallows sharing the same aten-cpu shared object \ + between different GPU backends \ + """, + '--', + '@{{PATHSFILE}}' +] +is_formatter = true diff --git a/BUILD.bazel b/BUILD.bazel index 0afee2d8d71c9..d3084d9ebd447 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -228,6 +228,7 @@ filegroup( [ "aten/src/ATen/cuda/*.cpp", "aten/src/ATen/cuda/detail/*.cpp", + "aten/src/ATen/cuda/tunable/*.cpp", "aten/src/ATen/cudnn/*.cpp", "aten/src/ATen/native/cuda/*.cpp", "aten/src/ATen/native/cuda/linalg/*.cpp", @@ -445,7 +446,6 @@ cu_library( # caffe2 CAFFE2_COPTS = COMMON_COPTS + [ "-Dcaffe2_EXPORTS", - "-DCAFFE2_USE_GLOO", "-DCAFFE2_USE_CUDNN", "-DCAFFE2_BUILD_MAIN_LIB", "-fvisibility-inlines-hidden", @@ -453,22 +453,6 @@ CAFFE2_COPTS = COMMON_COPTS + [ "-fno-trapping-math", ] -filegroup( - name = "caffe2_contrib_srcs", - srcs = [ - "caffe2/contrib/aten/aten_op.cc", - "caffe2/contrib/gloo/allgather_ops.cc", - "caffe2/contrib/gloo/allreduce_ops.cc", - "caffe2/contrib/gloo/barrier_ops.cc", - "caffe2/contrib/gloo/broadcast_ops.cc", - "caffe2/contrib/gloo/common.cc", - "caffe2/contrib/gloo/common_world_ops.cc", - "caffe2/contrib/gloo/context.cc", - "caffe2/contrib/gloo/reduce_scatter_ops.cc", - "caffe2/contrib/gloo/store_handler.cc", - ], -) - filegroup( name = "caffe2_core_srcs", srcs = [ @@ -519,363 +503,6 @@ filegroup( ], ) -filegroup( - name = "caffe2_distributed_srcs", - srcs = [ - "caffe2/distributed/file_store_handler.cc", - "caffe2/distributed/file_store_handler_op.cc", - "caffe2/distributed/store_handler.cc", - "caffe2/distributed/store_ops.cc", - ], -) - -filegroup( - name = "caffe2_ideep_srcs", - srcs = [ - "caffe2/ideep/operators/adam_op.cc", - "caffe2/ideep/operators/channel_shuffle_op.cc", - "caffe2/ideep/operators/concat_split_op.cc", - "caffe2/ideep/operators/conv_op.cc", - "caffe2/ideep/operators/conv_transpose_op.cc", - "caffe2/ideep/operators/dropout_op.cc", - "caffe2/ideep/operators/elementwise_sum_op.cc", - "caffe2/ideep/operators/expand_squeeze_dims_op.cc", - "caffe2/ideep/operators/fully_connected_op.cc", - "caffe2/ideep/operators/local_response_normalization_op.cc", - "caffe2/ideep/operators/momentum_sgd_op.cc", - "caffe2/ideep/operators/operator_fallback_ideep.cc", - "caffe2/ideep/operators/order_switch_ops.cc", - "caffe2/ideep/operators/pool_op.cc", - "caffe2/ideep/operators/quantization/int8_add_op.cc", - "caffe2/ideep/operators/quantization/int8_conv_op.cc", - "caffe2/ideep/operators/quantization/int8_dequantize_op.cc", - "caffe2/ideep/operators/quantization/int8_fully_connected_op.cc", - "caffe2/ideep/operators/quantization/int8_given_tensor_fill_op.cc", - "caffe2/ideep/operators/quantization/int8_pool_op.cc", - "caffe2/ideep/operators/quantization/int8_quantize_op.cc", - "caffe2/ideep/operators/quantization/int8_relu_op.cc", - "caffe2/ideep/operators/queue_ops.cc", - "caffe2/ideep/operators/relu_op.cc", - "caffe2/ideep/operators/reshape_op.cc", - "caffe2/ideep/operators/shape_op.cc", - "caffe2/ideep/operators/sigmoid_op.cc", - "caffe2/ideep/operators/spatial_batch_norm_op.cc", - "caffe2/ideep/operators/transpose_op.cc", - "caffe2/ideep/operators/utility_ops.cc", - "caffe2/ideep/utils/ideep_register.cc", - ], -) - -filegroup( - name = "caffe2_onnx_srcs", - srcs = [ - "caffe2/onnx/backend.cc", - "caffe2/onnx/backend_rep.cc", - "caffe2/onnx/device.cc", - "caffe2/onnx/helper.cc", - "caffe2/onnx/offline_tensor.cc", - "caffe2/onnx/onnx_exporter.cc", - "caffe2/onnx/onnxifi_graph_info.cc", - "caffe2/onnx/onnxifi_init.cc", - ], -) - -filegroup( - name = 
"caffe2_operators_srcs", - srcs = [ - "caffe2/operators/abs_op.cc", - "caffe2/operators/accumulate_op.cc", - "caffe2/operators/accuracy_op.cc", - "caffe2/operators/acos_op.cc", - "caffe2/operators/affine_channel_op.cc", - "caffe2/operators/alias_with_name.cc", - "caffe2/operators/apmeter_op.cc", - "caffe2/operators/arg_ops.cc", - "caffe2/operators/asin_op.cc", - "caffe2/operators/assert_op.cc", - "caffe2/operators/atan_op.cc", - "caffe2/operators/atomic_ops.cc", - "caffe2/operators/batch_box_cox_op.cc", - "caffe2/operators/batch_bucketize_op.cc", - "caffe2/operators/batch_gather_ops.cc", - "caffe2/operators/batch_matmul_op.cc", - "caffe2/operators/batch_moments_op.cc", - "caffe2/operators/batch_permutation_op.cc", - "caffe2/operators/batch_sparse_to_dense_op.cc", - "caffe2/operators/bbox_transform_op.cc", - "caffe2/operators/bisect_percentile_op.cc", - "caffe2/operators/boolean_mask_ops.cc", - "caffe2/operators/boolean_unmask_ops.cc", - "caffe2/operators/box_with_nms_limit_op.cc", - "caffe2/operators/bucketize_op.cc", - "caffe2/operators/byte_weight_dequant_op.cc", - "caffe2/operators/cast_op.cc", - "caffe2/operators/cbrt_op.cc", - "caffe2/operators/cc_bmm_bg_op.cc", - "caffe2/operators/ceil_op.cc", - "caffe2/operators/channel_backprop_stats_op.cc", - "caffe2/operators/channel_shuffle_op.cc", - "caffe2/operators/channel_stats_op.cc", - "caffe2/operators/clip_op.cc", - "caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc", - "caffe2/operators/communicator_op.cc", - "caffe2/operators/concat_split_op.cc", - "caffe2/operators/conditional_op.cc", - "caffe2/operators/conv_gradient_op.cc", - "caffe2/operators/conv_op.cc", - "caffe2/operators/conv_op_eigen.cc", - "caffe2/operators/conv_op_shared.cc", - "caffe2/operators/conv_transpose_gradient_op.cc", - "caffe2/operators/conv_transpose_op.cc", - "caffe2/operators/conv_transpose_op_mobile.cc", - "caffe2/operators/copy_op.cc", - "caffe2/operators/copy_rows_to_tensor_op.cc", - "caffe2/operators/cos_op.cc", - "caffe2/operators/cosh_op.cc", - "caffe2/operators/cosine_embedding_criterion_op.cc", - "caffe2/operators/counter_ops.cc", - "caffe2/operators/crash_op.cc", - "caffe2/operators/create_scope_op.cc", - "caffe2/operators/crf_viterbi_op.cc", - "caffe2/operators/cross_entropy_op.cc", - "caffe2/operators/ctc_beam_search_decoder_op.cc", - "caffe2/operators/ctc_greedy_decoder_op.cc", - "caffe2/operators/cube_op.cc", - "caffe2/operators/data_couple.cc", - "caffe2/operators/dataset_ops.cc", - "caffe2/operators/deform_conv_gradient_op.cc", - "caffe2/operators/deform_conv_op.cc", - "caffe2/operators/dense_vector_to_id_list_op.cc", - "caffe2/operators/distance_op.cc", - "caffe2/operators/do_op.cc", - "caffe2/operators/dropout_op.cc", - "caffe2/operators/elementwise_add_gradient_op.cc", - "caffe2/operators/elementwise_add_op.cc", - "caffe2/operators/elementwise_div_gradient_op.cc", - "caffe2/operators/elementwise_div_op.cc", - "caffe2/operators/elementwise_linear_op.cc", - "caffe2/operators/elementwise_logical_ops.cc", - "caffe2/operators/elementwise_mul_gradient_op.cc", - "caffe2/operators/elementwise_mul_op.cc", - "caffe2/operators/elementwise_ops.cc", - "caffe2/operators/elementwise_ops_schema.cc", - "caffe2/operators/elementwise_ops_utils.cc", - "caffe2/operators/elementwise_sub_gradient_op.cc", - "caffe2/operators/elementwise_sub_op.cc", - "caffe2/operators/elementwise_sum_op.cc", - "caffe2/operators/elu_op.cc", - "caffe2/operators/enforce_finite_op.cc", - "caffe2/operators/ensure_clipped_op.cc", - "caffe2/operators/ensure_cpu_output_op.cc", - 
"caffe2/operators/erf_op.cc", - "caffe2/operators/exp_op.cc", - "caffe2/operators/expand_op.cc", - "caffe2/operators/expand_squeeze_dims_op.cc", - "caffe2/operators/fc_inference.cc", - "caffe2/operators/feature_maps_ops.cc", - "caffe2/operators/feed_blob_op.cc", - "caffe2/operators/filler_op.cc", - "caffe2/operators/find_duplicate_elements_op.cc", - "caffe2/operators/find_op.cc", - "caffe2/operators/flatten_op.cc", - "caffe2/operators/flexible_top_k.cc", - "caffe2/operators/floor_op.cc", - "caffe2/operators/free_op.cc", - "caffe2/operators/fully_connected_op.cc", - "caffe2/operators/fused_rowwise_8bit_conversion_ops.cc", - "caffe2/operators/fused_rowwise_random_quantization_ops.cc", - "caffe2/operators/gather_fused_8bit_rowwise_op.cc", - "caffe2/operators/gather_op.cc", - "caffe2/operators/gather_ranges_to_dense_op.cc", - "caffe2/operators/gelu_op.cc", - "caffe2/operators/generate_proposals_op.cc", - "caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cc", - "caffe2/operators/given_tensor_fill_op.cc", - "caffe2/operators/glu_op.cc", - "caffe2/operators/group_norm_op.cc", - "caffe2/operators/gru_unit_op.cc", - "caffe2/operators/h_softmax_op.cc", - "caffe2/operators/half_float_ops.cc", - "caffe2/operators/hard_sigmoid_op.cc", - "caffe2/operators/heatmap_max_keypoint_op.cc", - "caffe2/operators/if_op.cc", - "caffe2/operators/im2col_op.cc", - "caffe2/operators/index_hash_ops.cc", - "caffe2/operators/index_ops.cc", - "caffe2/operators/inference_lstm_op.cc", - "caffe2/operators/instance_norm_gradient_op.cc", - "caffe2/operators/instance_norm_op.cc", - "caffe2/operators/integral_image_op.cc", - "caffe2/operators/is_empty_op.cc", - "caffe2/operators/jsd_op.cc", - "caffe2/operators/key_split_ops.cc", - "caffe2/operators/last_n_window_collector.cc", - "caffe2/operators/layer_norm_op.cc", - "caffe2/operators/leaky_relu_op.cc", - "caffe2/operators/length_split_op.cc", - "caffe2/operators/lengths_pad_op.cc", - "caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc", - "caffe2/operators/lengths_reducer_ops.cc", - "caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc", - "caffe2/operators/lengths_tile_op.cc", - "caffe2/operators/lengths_top_k_op.cc", - "caffe2/operators/listwise_l2r_op.cc", - "caffe2/operators/load_save_op.cc", - "caffe2/operators/load_save_op_util.cc", - "caffe2/operators/local_response_normalization_op.cc", - "caffe2/operators/locally_connected_op.cc", - "caffe2/operators/locally_connected_op_util.cc", - "caffe2/operators/log_op.cc", - "caffe2/operators/logit_op.cc", - "caffe2/operators/loss_op.cc", - "caffe2/operators/lp_pool_op.cc", - "caffe2/operators/lpnorm_op.cc", - "caffe2/operators/lstm_unit_op.cc", - "caffe2/operators/map_ops.cc", - "caffe2/operators/margin_ranking_criterion_op.cc", - "caffe2/operators/matmul_op.cc", - "caffe2/operators/mean_op.cc", - "caffe2/operators/merge_id_lists_op.cc", - "caffe2/operators/minmax_gradient_ops.cc", - "caffe2/operators/minmax_ops.cc", - "caffe2/operators/mod_op.cc", - "caffe2/operators/moments_op.cc", - "caffe2/operators/multi_class_accuracy_op.cc", - "caffe2/operators/negate_gradient_op.cc", - "caffe2/operators/negative_op.cc", - "caffe2/operators/ngram_ops.cc", - "caffe2/operators/norm_planar_yuv_op.cc", - "caffe2/operators/normalize_l1_op.cc", - "caffe2/operators/normalize_op.cc", - "caffe2/operators/numpy_tile_op.cc", - "caffe2/operators/one_hot_ops.cc", - "caffe2/operators/onnx_while_op.cc", - "caffe2/operators/order_switch_ops.cc", - "caffe2/operators/pack_rnn_sequence_op.cc", - "caffe2/operators/pack_segments.cc", - 
"caffe2/operators/pad_op.cc", - "caffe2/operators/partition_ops.cc", - "caffe2/operators/percentile_op.cc", - "caffe2/operators/perplexity_op.cc", - "caffe2/operators/piecewise_linear_transform_op.cc", - "caffe2/operators/pool_gradient_op.cc", - "caffe2/operators/pool_op.cc", - "caffe2/operators/pool_op_util.cc", - "caffe2/operators/pow_op.cc", - "caffe2/operators/prelu_op.cc", - "caffe2/operators/prepend_dim_op.cc", - "caffe2/operators/quant_decode_op.cc", - "caffe2/operators/rank_loss_op.cc", - "caffe2/operators/reciprocal_gradient_op.cc", - "caffe2/operators/reciprocal_op.cc", - "caffe2/operators/reduce_front_back_max_ops.cc", - "caffe2/operators/reduce_front_back_mean_ops.cc", - "caffe2/operators/reduce_front_back_sum_ops.cc", - "caffe2/operators/reduce_ops.cc", - "caffe2/operators/reduction_ops.cc", - "caffe2/operators/relu_n_op.cc", - "caffe2/operators/relu_op.cc", - "caffe2/operators/remove_data_blocks_op.cc", - "caffe2/operators/replace_nan_op.cc", - "caffe2/operators/reservoir_sampling.cc", - "caffe2/operators/reshape_op.cc", - "caffe2/operators/resize_3d_op.cc", - "caffe2/operators/resize_op.cc", - "caffe2/operators/reverse_packed_segs_op.cc", - "caffe2/operators/rmac_regions_op.cc", - "caffe2/operators/rnn/recurrent_network_blob_fetcher_op.cc", - "caffe2/operators/rnn/recurrent_network_executor.cc", - "caffe2/operators/rnn/recurrent_network_op.cc", - "caffe2/operators/roi_align_gradient_op.cc", - "caffe2/operators/roi_align_op.cc", - "caffe2/operators/roi_align_rotated_gradient_op.cc", - "caffe2/operators/roi_align_rotated_op.cc", - "caffe2/operators/roi_pool_op.cc", - "caffe2/operators/rowmul_op.cc", - "caffe2/operators/rsqrt_op.cc", - "caffe2/operators/scale_blobs_op.cc", - "caffe2/operators/scale_op.cc", - "caffe2/operators/segment_reduction_op.cc", - "caffe2/operators/selu_op.cc", - "caffe2/operators/sequence_ops.cc", - "caffe2/operators/shape_op.cc", - "caffe2/operators/sigmoid_gradient_op.cc", - "caffe2/operators/sigmoid_op.cc", - "caffe2/operators/sin_op.cc", - "caffe2/operators/sinh_op.cc", - "caffe2/operators/sinusoid_position_encoding_op.cc", - "caffe2/operators/slice_op.cc", - "caffe2/operators/softmax_op.cc", - "caffe2/operators/softmax_utils.cc", - "caffe2/operators/softmax_with_loss_op.cc", - "caffe2/operators/softplus_op.cc", - "caffe2/operators/softsign_op.cc", - "caffe2/operators/space_batch_op.cc", - "caffe2/operators/sparse_dropout_with_replacement_op.cc", - "caffe2/operators/sparse_normalize_op.cc", - "caffe2/operators/sparse_to_dense_mask_op.cc", - "caffe2/operators/sparse_to_dense_op.cc", - "caffe2/operators/spatial_batch_norm_gradient_op.cc", - "caffe2/operators/spatial_batch_norm_op.cc", - "caffe2/operators/spatial_softmax_with_loss_op.cc", - "caffe2/operators/sqr_op.cc", - "caffe2/operators/sqrt_op.cc", - "caffe2/operators/square_root_divide_op.cc", - "caffe2/operators/stats_ops.cc", - "caffe2/operators/stats_put_ops.cc", - "caffe2/operators/stop_gradient.cc", - "caffe2/operators/string_ops.cc", - "caffe2/operators/stump_func_op.cc", - "caffe2/operators/stylizer_ops.cc", - "caffe2/operators/summarize_op.cc", - "caffe2/operators/swish_op.cc", - "caffe2/operators/tan_op.cc", - "caffe2/operators/tanh_gradient_op.cc", - "caffe2/operators/tanh_op.cc", - "caffe2/operators/tensor_protos_db_input.cc", - "caffe2/operators/text_file_reader.cc", - "caffe2/operators/text_file_reader_utils.cc", - "caffe2/operators/thresholded_relu_op.cc", - "caffe2/operators/tile_op.cc", - "caffe2/operators/top_k.cc", - "caffe2/operators/transpose_op.cc", - 
"caffe2/operators/tt_linear_op.cc", - "caffe2/operators/unique_ops.cc", - "caffe2/operators/upsample_op.cc", - "caffe2/operators/utility_ops.cc", - "caffe2/operators/variable_length_sequence_padding.cc", - "caffe2/operators/weighted_multi_sampling_op.cc", - "caffe2/operators/weighted_sample_op.cc", - "caffe2/operators/while_op.cc", - "caffe2/operators/workspace_ops.cc", - "caffe2/operators/zero_gradient_op.cc", - ], -) - -filegroup( - name = "caffe2_opt_srcs", - srcs = [ - "caffe2/opt/annotations.cc", - "caffe2/opt/backend_cutting.cc", - "caffe2/opt/backend_transformer_base.cc", - "caffe2/opt/bound_shape_inferencer.cc", - "caffe2/opt/converter.cc", - "caffe2/opt/dead_code_elim.cc", - "caffe2/opt/device.cc", - "caffe2/opt/distributed.cc", - "caffe2/opt/distributed_converter.cc", - "caffe2/opt/fusion.cc", - "caffe2/opt/mobile.cc", - "caffe2/opt/onnxifi_op.cc", - "caffe2/opt/onnxifi_transformer.cc", - "caffe2/opt/optimize_ideep.cc", - "caffe2/opt/optimizer.cc", - "caffe2/opt/passes.cc", - "caffe2/opt/shape_info.cc", - "caffe2/opt/tvm_transformer.cc", - ], -) - filegroup( name = "caffe2_perfkernels_srcs", srcs = [ @@ -891,70 +518,6 @@ filegroup( ], ) -filegroup( - name = "caffe2_predictor_srcs", - srcs = [ - "caffe2/predictor/emulator/data_filler.cc", - "caffe2/predictor/emulator/data_filler.h", - "caffe2/predictor/predictor.cc", - "caffe2/predictor/predictor_config.cc", - "caffe2/predictor/predictor_utils.cc", - ], -) - -filegroup( - name = "caffe2_quantization_srcs", - srcs = [ - "caffe2/quantization/server/activation_distribution_observer.cc", - "caffe2/quantization/server/batch_matmul_dnnlowp_op.cc", - "caffe2/quantization/server/caffe2_dnnlowp_utils.cc", - "caffe2/quantization/server/channel_shuffle_dnnlowp_op.cc", - "caffe2/quantization/server/concat_dnnlowp_op.cc", - "caffe2/quantization/server/conv_dnnlowp_acc16_op.cc", - "caffe2/quantization/server/conv_dnnlowp_op.cc", - "caffe2/quantization/server/conv_relu_op.cc", - "caffe2/quantization/server/dequantize_dnnlowp_op.cc", - "caffe2/quantization/server/dnnlowp.cc", - "caffe2/quantization/server/dnnlowp_partition.cc", - "caffe2/quantization/server/dynamic_histogram.cc", - "caffe2/quantization/server/elementwise_add_dnnlowp_op.cc", - "caffe2/quantization/server/elementwise_linear_dnnlowp_op.cc", - "caffe2/quantization/server/elementwise_mul_dnnlowp_op.cc", - "caffe2/quantization/server/elementwise_sum_dnnlowp_op.cc", - "caffe2/quantization/server/elementwise_sum_relu_op.cc", - "caffe2/quantization/server/fbgemm_pack_matrix_cache.cc", - "caffe2/quantization/server/fbgemm_pack_op.cc", - "caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.cc", - "caffe2/quantization/server/fully_connected_dnnlowp_op.cc", - "caffe2/quantization/server/fully_connected_fake_lowp_op.cc", - "caffe2/quantization/server/group_norm_dnnlowp_op.cc", - "caffe2/quantization/server/int8_gen_quant_params.cc", - "caffe2/quantization/server/kl_minimization.cc", - "caffe2/quantization/server/lstm_unit_dnnlowp_op.cc", - "caffe2/quantization/server/norm_minimization.cc", - "caffe2/quantization/server/p99.cc", - "caffe2/quantization/server/pool_dnnlowp_op.cc", - "caffe2/quantization/server/quantize_dnnlowp_op.cc", - "caffe2/quantization/server/relu_dnnlowp_op.cc", - "caffe2/quantization/server/sigmoid.cc", - "caffe2/quantization/server/sigmoid_dnnlowp_op.cc", - "caffe2/quantization/server/spatial_batch_norm_dnnlowp_op.cc", - "caffe2/quantization/server/tanh.cc", - "caffe2/quantization/server/tanh_dnnlowp_op.cc", - "caffe2/quantization/server/utility_dnnlowp_ops.cc", - 
], -) - -filegroup( - name = "caffe2_queue_srcs", - srcs = [ - "caffe2/queue/blobs_queue.cc", - "caffe2/queue/blobs_queue_db.cc", - "caffe2/queue/queue_ops.cc", - "caffe2/queue/rebatching_queue.cc", - "caffe2/queue/rebatching_queue_ops.cc", - ], -) filegroup( name = "caffe2_serialize_srcs", @@ -966,36 +529,6 @@ filegroup( ], ) -filegroup( - name = "caffe2_sgd_srcs", - srcs = [ - "caffe2/sgd/adadelta_op.cc", - "caffe2/sgd/adagrad_op.cc", - "caffe2/sgd/adam_op.cc", - "caffe2/sgd/clip_tensor_op.cc", - "caffe2/sgd/ftrl_op.cc", - "caffe2/sgd/gftrl_op.cc", - "caffe2/sgd/iter_op.cc", - "caffe2/sgd/lars_op.cc", - "caffe2/sgd/learning_rate_adaption_op.cc", - "caffe2/sgd/learning_rate_op.cc", - "caffe2/sgd/momentum_sgd_op.cc", - "caffe2/sgd/rmsprop_op.cc", - "caffe2/sgd/wngrad_op.cc", - "caffe2/sgd/yellowfin_op.cc", - ], -) - -filegroup( - name = "caffe2_transforms_srcs", - srcs = [ - "caffe2/transforms/common_subexpression_elimination.cc", - "caffe2/transforms/conv_to_nnpack_transform.cc", - "caffe2/transforms/pattern_net_transform.cc", - "caffe2/transforms/single_op_transform.cc", - ], -) - filegroup( name = "caffe2_utils_srcs", srcs = [ @@ -1020,228 +553,6 @@ filegroup( ], ) -filegroup( - name = "caffe2_cuda_cpp_srcs", - srcs = [ - "caffe2/contrib/aten/aten_op_gpu.cc", - "caffe2/contrib/gloo/allreduce_ops_gpu.cc", - "caffe2/contrib/gloo/broadcast_ops_gpu.cc", - "caffe2/contrib/gloo/common_world_ops_gpu.cc", - "caffe2/core/blob_serialization_gpu.cc", - "caffe2/core/common_cudnn.cc", - "caffe2/core/common_gpu.cc", - "caffe2/core/event_gpu.cc", - "caffe2/db/create_db_op_gpu.cc", - "caffe2/distributed/file_store_handler_op_gpu.cc", - "caffe2/operators/communicator_op_gpu.cc", - "caffe2/operators/concat_split_op_gpu.cc", - "caffe2/operators/conv_op_cache_cudnn.cc", - "caffe2/operators/conv_op_cudnn.cc", - "caffe2/operators/conv_op_gpu.cc", - "caffe2/operators/conv_op_shared_gpu.cc", - "caffe2/operators/conv_transpose_op_cudnn.cc", - "caffe2/operators/conv_transpose_op_gpu.cc", - "caffe2/operators/counter_ops_gpu.cc", - "caffe2/operators/do_op_gpu.cc", - "caffe2/operators/dropout_op_cudnn.cc", - "caffe2/operators/elementwise_add_op_gpu.cc", - "caffe2/operators/elementwise_sub_op_gpu.cc", - "caffe2/operators/elu_op_cudnn.cc", - "caffe2/operators/exp_op_gpu.cc", - "caffe2/operators/expand_op_gpu.cc", - "caffe2/operators/expand_squeeze_dims_op_gpu.cc", - "caffe2/operators/free_op_gpu.cc", - "caffe2/operators/fully_connected_op_gpu.cc", - "caffe2/operators/if_op_gpu.cc", - "caffe2/operators/im2col_op_gpu.cc", - "caffe2/operators/load_save_op_gpu.cc", - "caffe2/operators/local_response_normalization_op_cudnn.cc", - "caffe2/operators/locally_connected_op_gpu.cc", - "caffe2/operators/log_op_gpu.cc", - "caffe2/operators/matmul_op_gpu.cc", - "caffe2/operators/negate_gradient_op_gpu.cc", - "caffe2/operators/negative_op_gpu.cc", - "caffe2/operators/order_switch_ops_cudnn.cc", - "caffe2/operators/order_switch_ops_gpu.cc", - "caffe2/operators/pool_op_cudnn.cc", - "caffe2/operators/prepend_dim_op_gpu.cc", - "caffe2/operators/reshape_op_gpu.cc", - "caffe2/operators/rnn/recurrent_network_blob_fetcher_op_gpu.cc", - "caffe2/operators/rnn/recurrent_network_executor_gpu.cc", - "caffe2/operators/rnn/recurrent_op_cudnn.cc", - "caffe2/operators/scale_op_gpu.cc", - "caffe2/operators/shape_op_gpu.cc", - "caffe2/operators/sigmoid_op_cudnn.cc", - "caffe2/operators/softmax_op_cudnn.cc", - "caffe2/operators/sqr_op_gpu.cc", - "caffe2/operators/sqrt_op_gpu.cc", - "caffe2/operators/stop_gradient_gpu.cc", - 
"caffe2/operators/tanh_op_cudnn.cc", - "caffe2/operators/tensor_protos_db_input_gpu.cc", - "caffe2/operators/transpose_op_cudnn.cc", - "caffe2/operators/while_op_gpu.cc", - "caffe2/operators/zero_gradient_op_gpu.cc", - "caffe2/queue/queue_ops_gpu.cc", - "caffe2/sgd/iter_op_gpu.cc", - "caffe2/sgd/learning_rate_op_gpu.cc", - ], -) - -filegroup( - name = "caffe2_cu_srcs", - srcs = [ - "caffe2/core/context_gpu.cu", - "caffe2/operators/abs_op.cu", - "caffe2/operators/accumulate_op.cu", - "caffe2/operators/accuracy_op.cu", - "caffe2/operators/acos_op.cu", - "caffe2/operators/affine_channel_op.cu", - "caffe2/operators/alias_with_name.cu", - "caffe2/operators/arg_ops.cu", - "caffe2/operators/asin_op.cu", - "caffe2/operators/assert_op.cu", - "caffe2/operators/atan_op.cu", - "caffe2/operators/batch_gather_ops.cu", - "caffe2/operators/batch_matmul_op.cu", - "caffe2/operators/batch_moments_op.cu", - "caffe2/operators/batch_permutation_op.cu", - "caffe2/operators/batch_sparse_to_dense_op.cu", - "caffe2/operators/boolean_mask_ops.cu", - "caffe2/operators/boolean_unmask_ops.cu", - "caffe2/operators/bucketize_op.cu", - "caffe2/operators/cast_op.cu", - "caffe2/operators/cbrt_op.cu", - "caffe2/operators/ceil_op.cu", - "caffe2/operators/channel_backprop_stats_op.cu", - "caffe2/operators/channel_shuffle_op.cu", - "caffe2/operators/channel_stats_op.cu", - "caffe2/operators/channelwise_conv3d_op_cudnn.cu", - "caffe2/operators/clip_op.cu", - "caffe2/operators/copy_op.cu", - "caffe2/operators/cos_op.cu", - "caffe2/operators/cosh_op.cu", - "caffe2/operators/cosine_embedding_criterion_op.cu", - "caffe2/operators/cross_entropy_op.cu", - "caffe2/operators/cube_op.cu", - "caffe2/operators/data_couple_gpu.cu", - "caffe2/operators/deform_conv_op.cu", - "caffe2/operators/depthwise_3x3_conv_op_cudnn.cu", - "caffe2/operators/distance_op.cu", - "caffe2/operators/dropout_op.cu", - "caffe2/operators/elementwise_div_op.cu", - "caffe2/operators/elementwise_linear_op.cu", - "caffe2/operators/elementwise_mul_op.cu", - "caffe2/operators/elementwise_ops.cu", - "caffe2/operators/elu_op.cu", - "caffe2/operators/enforce_finite_op.cu", - "caffe2/operators/ensure_cpu_output_op.cu", - "caffe2/operators/erf_op.cu", - "caffe2/operators/filler_op.cu", - "caffe2/operators/find_op.cu", - "caffe2/operators/floor_op.cu", - "caffe2/operators/gather_op.cu", - "caffe2/operators/gelu_op.cu", - "caffe2/operators/generate_proposals_op.cu", - "caffe2/operators/generate_proposals_op_util_nms_gpu.cu", - "caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cu", - "caffe2/operators/given_tensor_fill_op.cu", - "caffe2/operators/glu_op.cu", - "caffe2/operators/group_norm_op.cu", - "caffe2/operators/gru_unit_op_gpu.cu", - "caffe2/operators/half_float_ops.cu", - "caffe2/operators/hard_sigmoid_op.cu", - "caffe2/operators/instance_norm_op.cu", - "caffe2/operators/integral_image_op.cu", - "caffe2/operators/layer_norm_op.cu", - "caffe2/operators/leaky_relu_op.cu", - "caffe2/operators/lengths_pad_op.cu", - "caffe2/operators/lengths_tile_op.cu", - "caffe2/operators/local_response_normalization_op.cu", - "caffe2/operators/logit_op.cu", - "caffe2/operators/loss_op.cu", - "caffe2/operators/lp_pool_op.cu", - "caffe2/operators/lstm_unit_op_gpu.cu", - "caffe2/operators/margin_ranking_criterion_op.cu", - "caffe2/operators/max_pool_with_index.cu", - "caffe2/operators/mean_op.cu", - "caffe2/operators/mem_query_op.cu", - "caffe2/operators/minmax_ops.cu", - "caffe2/operators/moments_op.cu", - "caffe2/operators/multi_class_accuracy_op.cu", - 
"caffe2/operators/normalize_ops.cu", - "caffe2/operators/one_hot_ops.cu", - "caffe2/operators/pack_segments.cu", - "caffe2/operators/pad_op_gpu.cu", - "caffe2/operators/perplexity_op.cu", - "caffe2/operators/piecewise_linear_transform_op.cu", - "caffe2/operators/pool_op.cu", - "caffe2/operators/pow_op.cu", - "caffe2/operators/prelu_op.cu", - "caffe2/operators/reciprocal_op.cu", - "caffe2/operators/reduce_front_back_max_ops.cu", - "caffe2/operators/reduce_front_back_sum_mean_ops.cu", - "caffe2/operators/reduce_ops.cu", - "caffe2/operators/reduction_ops.cu", - "caffe2/operators/relu_n_op.cu", - "caffe2/operators/relu_op.cu", - "caffe2/operators/replace_nan_op.cu", - "caffe2/operators/resize_3d_op.cu", - "caffe2/operators/resize_op.cu", - "caffe2/operators/reverse_packed_segs_op.cu", - "caffe2/operators/rmac_regions_op.cu", - "caffe2/operators/rnn/recurrent_network_op_gpu.cu", - "caffe2/operators/roi_align_gradient_op.cu", - "caffe2/operators/roi_align_op.cu", - "caffe2/operators/roi_align_rotated_gradient_op.cu", - "caffe2/operators/roi_align_rotated_op.cu", - "caffe2/operators/roi_pool_op.cu", - "caffe2/operators/rsqrt_op.cu", - "caffe2/operators/scale_blobs_op.cu", - "caffe2/operators/segment_reduction_op_gpu.cu", - "caffe2/operators/selu_op.cu", - "caffe2/operators/sequence_ops.cu", - "caffe2/operators/sigmoid_op.cu", - "caffe2/operators/sin_op.cu", - "caffe2/operators/sinh_op.cu", - "caffe2/operators/slice_op.cu", - "caffe2/operators/softmax_ops.cu", - "caffe2/operators/softplus_op.cu", - "caffe2/operators/softsign_op.cu", - "caffe2/operators/space_batch_op_gpu.cu", - "caffe2/operators/sparse_normalize_op_gpu.cu", - "caffe2/operators/sparse_to_dense_op.cu", - "caffe2/operators/spatial_batch_norm_op.cu", - "caffe2/operators/spatial_batch_norm_op_cudnn.cu", - "caffe2/operators/stump_func_op.cu", - "caffe2/operators/summarize_op.cu", - "caffe2/operators/swish_op.cu", - "caffe2/operators/tan_op.cu", - "caffe2/operators/tanh_op.cu", - "caffe2/operators/thresholded_relu_op.cu", - "caffe2/operators/tile_op.cu", - "caffe2/operators/top_k.cu", - "caffe2/operators/transpose_op.cu", - "caffe2/operators/unique_ops.cu", - "caffe2/operators/upsample_op.cu", - "caffe2/operators/utility_ops.cu", - "caffe2/operators/weighted_sample_op.cu", - "caffe2/sgd/adadelta_op_gpu.cu", - "caffe2/sgd/adagrad_op_gpu.cu", - "caffe2/sgd/adam_op_gpu.cu", - "caffe2/sgd/fp16_momentum_sgd_op.cu", - "caffe2/sgd/fp32_momentum_sgd_op.cu", - "caffe2/sgd/lars_op_gpu.cu", - "caffe2/sgd/momentum_sgd_op_gpu.cu", - "caffe2/sgd/rmsprop_op_gpu.cu", - "caffe2/sgd/yellowfin_op_gpu.cu", - "caffe2/utils/math/broadcast.cu", - "caffe2/utils/math/elementwise.cu", - "caffe2/utils/math/reduce.cu", - "caffe2/utils/math/transpose.cu", - "caffe2/utils/math_gpu.cu", - ], -) - # To achieve finer granularity and make debug easier, caffe2 is split into three libraries: # ATen, caffe2 and caffe2_for_aten_headers. ATen lib group up source codes under # aten/ directory and caffe2 contains most files under `caffe2/` directory. Since the @@ -1270,35 +581,10 @@ cc_library( ], ) -py_binary( - name = "gen_op", - srcs = ["caffe2/contrib/aten/gen_op.py"], - deps = ["//torchgen"], -) - -genrule( - name = "generated_caffe2_aten_op_headers", - srcs = [ - "caffe2/contrib/aten/aten_op_template.h", - "aten/src/ATen/Declarations.yaml", - ], - outs = ["caffe2/caffe2/contrib/aten/gen_aten_op.h"], - cmd = """ - $(location :gen_op) \ - --output_prefix gen_ \ - --install_dir $(@D) \ - --aten_root `dirname $(location aten/src/ATen/Declarations.yaml)`/../.. 
\ - --template_dir `dirname $(location caffe2/contrib/aten/aten_op_template.h)` \ - --yaml_dir `dirname $(location aten/src/ATen/Declarations.yaml)`""", - tools = [":gen_op"], -) - cc_library( name = "caffe2_headers", hdrs = glob( [ - "caffe2/contrib/aten/*.h", - "caffe2/contrib/gloo/*.h", "caffe2/core/*.h", "caffe2/core/nomnigraph/include/nomnigraph/Converters/*.h", "caffe2/core/nomnigraph/include/nomnigraph/Generated/*.h", @@ -1307,25 +593,8 @@ cc_library( "caffe2/core/nomnigraph/include/nomnigraph/Support/*.h", "caffe2/core/nomnigraph/include/nomnigraph/Transformations/*.h", "caffe2/core/nomnigraph/tests/*.h", - "caffe2/db/*.h", - "caffe2/distributed/*.h", - "caffe2/ideep/*.h", - "caffe2/ideep/operators/*.h", - "caffe2/ideep/operators/quantization/*.h", - "caffe2/ideep/utils/*.h", - "caffe2/onnx/*.h", - "caffe2/operators/*.h", - "caffe2/operators/rnn/*.h", - "caffe2/opt/*.h", "caffe2/perfkernels/*.h", - "caffe2/predictor/*.h", - "caffe2/predictor/emulator/*.h", - "caffe2/quantization/server/*.h", - "caffe2/queue/*.h", "caffe2/serialize/*.h", - "caffe2/sgd/*.h", - "caffe2/share/contrib/depthwise/*.h", - "caffe2/transforms/*.h", "caffe2/utils/*.h", "caffe2/utils/math/*.h", "caffe2/utils/threadpool/*.h", @@ -1337,10 +606,9 @@ cc_library( ) + if_cuda(glob([ "caffe2/**/*.cuh", "caffe2/image/*.h", - ])) + [":generated_caffe2_aten_op_headers"], + ])), copts = CAFFE2_COPTS, includes = [ - "caffe2/contrib/aten", "caffe2/core/nomnigraph/include", ], visibility = ["//visibility:public"], @@ -1352,52 +620,12 @@ cc_library( ], ) -cc_library( - name = "caffe2_dnnlowp_avx2_ops", - srcs = [ - "caffe2/quantization/server/elementwise_sum_dnnlowp_op_avx2.cc", - "caffe2/quantization/server/fully_connected_fake_lowp_op_avx2.cc", - "caffe2/quantization/server/group_norm_dnnlowp_op_avx2.cc", - "caffe2/quantization/server/norm_minimization_avx2.cc", - "caffe2/quantization/server/pool_dnnlowp_op_avx2.cc", - "caffe2/quantization/server/relu_dnnlowp_op_avx2.cc", - "caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_avx2.cc", - "caffe2/quantization/server/transpose.cc", - ], - copts = CAFFE2_COPTS + [ - "-mf16c", - "-mavx2", - "-mfma", - "-mxsave", - ], - visibility = ["//visibility:public"], - deps = [ - ":caffe2_headers", - "@fbgemm", - ], - alwayslink = True, -) - cc_library( name = "caffe2", srcs = [ - "caffe2/db/create_db_op.cc", - "caffe2/db/protodb.cc", - "caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc", - ":caffe2_contrib_srcs", ":caffe2_core_srcs", - ":caffe2_distributed_srcs", - ":caffe2_ideep_srcs", - ":caffe2_onnx_srcs", - ":caffe2_operators_srcs", - ":caffe2_opt_srcs", ":caffe2_perfkernels_srcs", - ":caffe2_predictor_srcs", - ":caffe2_quantization_srcs", - ":caffe2_queue_srcs", ":caffe2_serialize_srcs", - ":caffe2_sgd_srcs", - ":caffe2_transforms_srcs", ":caffe2_utils_srcs", ], copts = CAFFE2_COPTS + ["-mf16c"], @@ -1405,7 +633,6 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":caffe2_core_macros", - ":caffe2_dnnlowp_avx2_ops", ":caffe2_headers", ":caffe2_perfkernels_avx", ":caffe2_perfkernels_avx2", @@ -1418,11 +645,9 @@ cc_library( "@fbgemm//:fbgemm_src_headers", "@fmt", "@foxi", - "@gloo", "@onnx", ] + if_cuda( [ - ":caffe2_cuda_cpp", ":aten_cuda", "@tensorpipe//:tensorpipe_cuda", ], @@ -1434,39 +659,20 @@ cc_library( alwayslink = True, ) -cc_library( - name = "caffe2_cuda_cpp", - srcs = [":caffe2_cuda_cpp_srcs"], - copts = CAFFE2_COPTS, - visibility = ["//visibility:public"], - deps = [ - ":caffe2_cuda", - ":caffe2_headers", - ], - alwayslink = True, -) - cu_library( - 
name = "caffe2_cuda", - # one may think that `quantization_gpu.cu` could be a separate kernel, - # however that leads to de-registration problem that's described in - # https://github.com/pytorch/pytorch/issues/79236 - # To solve it we add it into the `caffe2_cuda`, - # this is also aligned with the CMake build. - srcs = [":caffe2_cu_srcs"] + [ + name = "torch_cuda", + srcs = [ "torch/csrc/distributed/c10d/intra_node_comm.cu", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", ], - copts = CAFFE2_COPTS + torch_cuda_half_options, + copts = torch_cuda_half_options, visibility = ["//visibility:public"], deps = [ ":aten", - ":caffe2_headers", "@cuda//:cublas", "@cuda//:curand", "@cudnn", "@eigen", - "@gloo", "@tensorpipe//:tensorpipe_cuda", ], alwayslink = True, @@ -1640,6 +846,7 @@ cc_library( ] + if_cuda([ "@cuda//:nvToolsExt", "@cutlass", + ":torch_cuda", ]), alwayslink = True, ) @@ -1761,12 +968,10 @@ py_library( visibility = ["//visibility:public"], srcs = glob(["torch/**/*.py"], exclude = ["torch/version.py"]) + [":torch/version.py"] + glob(["functorch/**/*.py"]), deps = [ - rules.requirement("future"), rules.requirement("numpy"), rules.requirement("pyyaml"), rules.requirement("requests"), rules.requirement("setuptools"), - rules.requirement("six"), rules.requirement("sympy"), rules.requirement("typing_extensions"), "//torchgen", diff --git a/CITATION.cff b/CITATION.cff index 2bebc947bfb2f..e6de8772cbf21 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -6,68 +6,111 @@ authors: url: https://pytorch.org preferred-citation: type: conference-paper - title: "PyTorch: An Imperative Style, High-Performance Deep Learning Library" + title: "PyTorch 2: Faster Machine Learning Through Dynamic Python Bytecode Transformation and Graph Compilation" authors: - - family-names: Paszke - given-names: Adam - - family-names: Gross - given-names: Sam - - family-names: Massa - given-names: Francisco - - family-names: Lerer - given-names: Adam - - family-names: Bradbury - given-names: James - - family-names: Chanan - given-names: Gregory - - family-names: Killeen - given-names: Trevor - - family-names: Lin - given-names: Zeming + - family-names: Ansel + given-names: Jason + - family-names: Yang + given-names: Edward + - family-names: He + given-names: Horace - family-names: Gimelshein given-names: Natalia - - family-names: Antiga - given-names: Luca + - family-names: Jain + given-names: Animesh + - family-names: Voznesensky + given-names: Michael + - family-names: Bao + given-names: Bin + - family-names: Bell + given-names: Peter + - family-names: Berard + given-names: David + - family-names: Burovski + given-names: Evgeni + - family-names: Chauhan + given-names: Geeta + - family-names: Chourdia + given-names: Anjali + - family-names: Constable + given-names: Will - family-names: Desmaison given-names: Alban - - family-names: Kopf - given-names: Andreas - - family-names: Yang - given-names: Edward - family-names: DeVito given-names: Zachary - - family-names: Raison - given-names: Martin - - family-names: Tejani - given-names: Alykhan - - family-names: Chilamkurthy - given-names: Sasank - - family-names: Steiner - given-names: Benoit - - family-names: Fang - given-names: Lu - - family-names: Bai - given-names: Junjie + - family-names: Ellison + given-names: Elias + - family-names: Feng + given-names: Will + - family-names: Gong + given-names: Jiong + - family-names: Gschwind + given-names: Michael + - family-names: Hirsh + given-names: Brian + - family-names: Huang + given-names: Sherlock + - 
family-names: Kalambarkar + given-names: Kshiteej + - family-names: Kirsch + given-names: Laurent + - family-names: Lazos + given-names: Michael + - family-names: Lezcano + given-names: Mario + - family-names: Liang + given-names: Yanbo + - family-names: Liang + given-names: Jason + - family-names: Lu + given-names: Yinghai + - family-names: Luk + given-names: CK + - family-names: Maher + given-names: Bert + - family-names: Pan + given-names: Yunjie + - family-names: Puhrsch + given-names: Christian + - family-names: Reso + given-names: Matthias + - family-names: Saroufim + given-names: Mark + - family-names: Siraichi + given-names: Marcos Yukio + - family-names: Suk + given-names: Helen + - family-names: Suo + given-names: Michael + - family-names: Tillet + given-names: Phil + - family-names: Wang + given-names: Eikan + - family-names: Wang + given-names: Xiaodong + - family-names: Wen + given-names: William + - family-names: Zhang + given-names: Shunting + - family-names: Zhao + given-names: Xu + - family-names: Zhou + given-names: Keren + - family-names: Zou + given-names: Richard + - family-names: Mathews + given-names: Ajit + - family-names: Chanan + given-names: Gregory + - family-names: Wu + given-names: Peng - family-names: Chintala given-names: Soumith - collection-title: Advances in Neural Information Processing Systems 32 + collection-title: "29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24)" collection-type: proceedings - editors: - - family-names: Wallach - given-names: H. - - family-names: Larochelle - given-names: H. - - family-names: Beygelzimer - given-names: A. - - family-names: "d'AlchĆ©-Buc" - given-names: F. - - family-names: Fox - given-names: E. - - family-names: Garnett - given-names: R. - start: 8024 - end: 8035 - year: 2019 + month: 4 + year: 2024 publisher: - name: Curran Associates, Inc. - url: http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf + name: ACM + doi: "10.1145/3620665.3640366" + url: "https://pytorch.org/assets/pytorch2-2.pdf" diff --git a/CMakeLists.txt index ba7c20e434fd6..79db67e7357b5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,12 @@ cmake_policy(SET CMP0069 NEW) # nice when it's possible, and it's possible on our Windows configs. cmake_policy(SET CMP0092 NEW) +# Prohibit in-source builds +if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) +message(FATAL_ERROR "In-source builds are not supported") +endif() + + # ---[ Project and semantic versioning.
project(Torch CXX C) @@ -43,11 +49,15 @@ set(CMAKE_C_STANDARD 11 CACHE STRING "The C standard whose features are reques # ---[ Utils include(cmake/public/utils.cmake) -# --- [ Check that minimal gcc version is 9.4+ -if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.4) - message(FATAL "GCC-9.4 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}") +# --- [ Check that minimal gcc version is 9.3+ +if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.3) + message(FATAL_ERROR "GCC-9.3 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}") endif() +# This define is needed to preserve behavior given anticipated changes to cccl/thrust +# https://nvidia.github.io/libcudacxx/standard_api/numerics_library/complex.html +string(APPEND CMAKE_CUDA_FLAGS " -DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS") + if(LINUX) include(cmake/CheckAbi.cmake) string(APPEND CMAKE_CXX_FLAGS " -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}") @@ -198,6 +208,9 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON) option(USE_ASAN "Use Address+Undefined Sanitizers" OFF) option(USE_TSAN "Use Thread Sanitizer" OFF) option(USE_CUDA "Use CUDA" ON) +cmake_dependent_option( + USE_XPU "Use XPU. Only available on Linux." ON + "LINUX" OFF) cmake_dependent_option( BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) @@ -215,14 +228,10 @@ option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) option(USE_KINETO "Use Kineto profiling library" ON) option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) option(USE_FAKELOWP "Use FakeLowp operators" OFF) -option(USE_FFMPEG "Use ffmpeg" OFF) option(USE_GFLAGS "Use GFLAGS" OFF) option(USE_GLOG "Use GLOG" OFF) -option(USE_LEVELDB "Use LEVELDB" OFF) option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF) -option(USE_LMDB "Use LMDB" OFF) option(USE_MAGMA "Use MAGMA" ON) -option(USE_METAL "Use Metal for Caffe2 iOS build" ON) option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF) option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF) option(USE_NATIVE_ARCH "Use -march=native" OFF) @@ -251,15 +260,12 @@ cmake_dependent_option( option(USE_NUMPY "Use NumPy" ON) option(USE_OBSERVERS "Use observers module." OFF) option(USE_OPENCL "Use OpenCL" OFF) -option(USE_OPENCV "Use OpenCV" OFF) option(USE_OPENMP "Use OpenMP for parallel code" ON) option(USE_PRECOMPILED_HEADERS "Use pre-compiled headers to accelerate build." OFF) option(USE_PROF "Use profiling" OFF) option(USE_QNNPACK "Use QNNPACK (quantized 8-bit operators)" ON) option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON) -option(USE_REDIS "Use Redis" OFF) -option(USE_ROCKSDB "Use RocksDB" OFF) option(USE_SNPE "Use Qualcomm's SNPE library" OFF) option(USE_SYSTEM_EIGEN_INSTALL "Use system Eigen instead of the one under third_party" OFF) @@ -281,7 +287,6 @@ option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF) option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF) # option USE_XNNPACK: try to enable xnnpack by default.
option(USE_XNNPACK "Use XNNPACK" ON) -option(USE_ZMQ "Use ZMQ" OFF) option(USE_ZSTD "Use ZSTD" OFF) option(USE_ROCM_KERNEL_ASSERT "Use Kernel Assert for ROCm" OFF) # Ensure that an ITT build is the default for x86 CPUs @@ -344,6 +349,8 @@ cmake_dependent_option( "NOT INTERN_BUILD_MOBILE" OFF) cmake_dependent_option( BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF) +cmake_dependent_option( + BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder" OFF "USE_CUDA" OFF) option(USE_MIMALLOC "Use mimalloc" OFF) # Enable third party mimalloc library to improve memory allocation performance on Windows. @@ -727,13 +734,34 @@ if(MSVC) append_cxx_flag_if_supported("/utf-8" CMAKE_CXX_FLAGS) endif() -# CAVEAT: do NOT check USE_ROCM here, because USE_ROCM is always True until -# include(cmake/Dependencies.cmake) +# Note for ROCM platform: +# 1. USE_ROCM is always ON until include(cmake/Dependencies.cmake) +# 2. USE_CUDA will become OFF during re-configuration +# Truth Table: +# CUDA 1st pass: USE_CUDA=True;USE_ROCM=True, FLASH evaluates to ON by default +# CUDA 2nd pass: USE_CUDA=True;USE_ROCM=False, FLASH evaluates to ON by default +# ROCM 1st pass: USE_CUDA=True;USE_ROCM=True, FLASH evaluates to ON by default +# ROCM 2nd pass: USE_CUDA=False;USE_ROCM=True, FLASH evaluates to ON by default +# CPU 1st pass: USE_CUDA=False(Cmd Option);USE_ROCM=True, FLASH evaluates to OFF by default +# CPU 2nd pass: USE_CUDA=False(Cmd Option);USE_ROCM=False, FLASH evaluates to OFF by default +# Thus we cannot tell ROCM 2nd pass and CPU 1st pass apart +# +# The only solution is to include(cmake/Dependencies.cmake), and defer the +# aotriton build decision later. + +include(cmake/Dependencies.cmake) + cmake_dependent_option( USE_FLASH_ATTENTION "Whether to build the flash_attention kernel for scaled dot product attention.\ Will be disabled if not supported by the platform" ON - "USE_CUDA AND NOT MSVC" OFF) + "USE_CUDA OR USE_ROCM;NOT MSVC" OFF) + +# We are currently not using alibi attention for Flash +# So we disable this feature by default +# We don't currently document this feature because we don't +# suspect users building from source will need this +add_definitions(-DFLASHATTENTION_DISABLE_ALIBI) # CAVEAT: Again, do not check USE_ROCM here # Flash Attention2 will error while building for sm52 while Mem Eff Attention won't @@ -743,8 +771,6 @@ cmake_dependent_option( Will be disabled if not supported by the platform" ON "USE_CUDA" OFF) -include(cmake/Dependencies.cmake) - if(DEBUG_CUDA) string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -lineinfo") string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " -lineinfo") @@ -1146,6 +1172,7 @@ if(BUILD_SHARED_LIBS) COMPONENT dev) install(FILES ${PROJECT_SOURCE_DIR}/cmake/public/cuda.cmake + ${PROJECT_SOURCE_DIR}/cmake/public/xpu.cmake ${PROJECT_SOURCE_DIR}/cmake/public/glog.cmake ${PROJECT_SOURCE_DIR}/cmake/public/gflags.cmake ${PROJECT_SOURCE_DIR}/cmake/public/mkl.cmake @@ -1167,6 +1194,10 @@ if(BUILD_SHARED_LIBS) ${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUSPARSELT.cmake DESTINATION share/cmake/Caffe2/ COMPONENT dev) + install(FILES + ${PROJECT_SOURCE_DIR}/cmake/Modules/FindSYCLToolkit.cmake + DESTINATION share/cmake/Caffe2/ + COMPONENT dev) install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2 FILE Caffe2Targets.cmake @@ -1229,3 +1260,12 @@ if(DEFINED USE_CUSTOM_DEBINFO) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -g") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -g") endif() + +# Bundle PTXAS if needed +if(BUILD_BUNDLE_PTXAS AND USE_CUDA) + if(NOT EXISTS
"${PROJECT_SOURCE_DIR}/build/bin/ptxas") + message(STATUS "Copying PTXAS into the bin folder") + file(COPY "${CUDAToolkit_BIN_DIR}/ptxas" DESTINATION "${PROJECT_BINARY_DIR}") + endif() + install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" DESTINATION "${CMAKE_INSTALL_BINDIR}") +endif() diff --git a/CODEOWNERS b/CODEOWNERS index a07f5f81a2a51..e481e6611279a 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -43,12 +43,12 @@ nn/qat/ @jerryzh168 /torch/csrc/distributed/rpc/tensorpipe_agent.h @jiayisuse @osalpekar @lw # ONNX Export -/torch/_dynamo/backends/onnxrt.py @bowenbao @abock @thiagocrepaldi @wschin -/torch/csrc/jit/passes/onnx.h @bowenbao @abock @thiagocrepaldi -/torch/csrc/jit/passes/onnx.cpp @bowenbao @abock @thiagocrepaldi -/torch/csrc/jit/passes/onnx/ @bowenbao @abock @thiagocrepaldi -/torch/onnx/ @bowenbao @abock @thiagocrepaldi @wschin -/test/onnx/ @bowenbao @abock @thiagocrepaldi @wschin +/torch/_dynamo/backends/onnxrt.py @bowenbao @thiagocrepaldi @wschin +/torch/csrc/jit/passes/onnx.h @bowenbao @thiagocrepaldi +/torch/csrc/jit/passes/onnx.cpp @bowenbao @thiagocrepaldi +/torch/csrc/jit/passes/onnx/ @bowenbao @thiagocrepaldi +/torch/onnx/ @bowenbao @thiagocrepaldi @wschin +/test/onnx/ @bowenbao @thiagocrepaldi @wschin # CI /.ci @pytorch/pytorch-dev-infra @@ -67,6 +67,7 @@ nn/qat/ @jerryzh168 /test/run_test.py @pytorch/pytorch-dev-infra /torch/testing/_internal/common_device_type.py @mruberry /torch/testing/_internal/common_utils.py @pytorch/pytorch-dev-infra +/torch/testing/_internal/hop_db.py @tugsbayasgalan @zou3519 @ydwu4 # Parametrizations /torch/nn/utils/parametriz*.py @lezcano @@ -97,9 +98,13 @@ test/functorch/test_ops.py @zou3519 @chillee @kshitij12345 test/functorch/test_vmap.py @zou3519 @chillee @kshitij12345 # torch MPS -test/test_mps.py @kulinseth -aten/src/ATen/mps/ @kulinseth -aten/src/ATen/native/mps/ @kulinseth +test/test_mps.py @kulinseth @malfet +aten/src/ATen/mps/ @kulinseth @malfet +aten/src/ATen/native/mps/ @kulinseth @malfet + +# MTIA +aten/src/ATen/detail/MTIAHooksInterface.h @egienvalue +torch/csrc/mtia/ @egienvalue # Profiler torch/csrc/autograd/profiler* @aaronenyeshi @@ -111,7 +116,7 @@ torch/profiler/ @aaronenyeshi test/functorch/test_aotdispatch.py @ezyang @Chillee # Dataloader -torch/utils/data/ @ejguan +torch/utils/data/ @andrewkho @gokulavasan # hipify torch/utils/hipify/ @jeffdaily @jithunnair-amd @@ -130,3 +135,23 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd # torch.export /torch/export/ @avikchaudhuri @gmagogsfm @tugsbayasgalan @zhxchen17 /torch/_export/ @avikchaudhuri @gmagogsfm @tugsbayasgalan @zhxchen17 + +# serialization-related files +/aten/src/ATen/MapAllocator* @mikaylagawarecki +/caffe2/serialize/ @mikaylagawarecki +/torch/serialization.py @mikaylagawarecki +/torch/storage.py @mikaylagawarecki +/torch/csrc/Storage* @mikaylagawarecki +# subscribing for PyTorchFileWriter/PyTorchFileReader changes +/torch/csrc/jit/python/init.cpp @mikaylagawarecki + +# CUDA and CUDA math libraries +aten/src/ATen/cuda/ @eqy +aten/src/ATen/cudnn/ @eqy +aten/src/ATen/native/cuda/ @eqy +aten/src/ATen/native/cudnn/ @eqy +c10/cuda @eqy +torch/cuda/ @eqy +torch/csrc/cuda/ @eqy +torch/backends/cuda/ @eqy +torch/backends/cudnn/ @eqy diff --git a/Dockerfile b/Dockerfile index a2c4bef96598c..b8ff65fdd1e9a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,12 @@ -# syntax = docker/dockerfile:experimental -# -# NOTE: To build this you will need a docker version > 18.06 with -# experimental enabled and DOCKER_BUILDKIT=1 -# -# If you do not use buildkit you are not going to have a 
good time +# syntax=docker/dockerfile:1 + +# NOTE: Building this image requires docker version >= 23.0. # -# For reference: -# https://docs.docker.com/develop/develop-images/build_enhancements/ -ARG BASE_IMAGE=ubuntu:20.04 -ARG PYTHON_VERSION=3.8 +# For reference: +# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel + +ARG BASE_IMAGE=ubuntu:22.04 +ARG PYTHON_VERSION=3.11 FROM ${BASE_IMAGE} as dev-base RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ @@ -26,7 +24,7 @@ RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache ENV PATH /opt/conda/bin:$PATH FROM dev-base as conda -ARG PYTHON_VERSION=3.8 +ARG PYTHON_VERSION=3.11 # Automatically set by buildx ARG TARGETPLATFORM # translating Docker's TARGETPLATFORM into miniconda arches @@ -57,12 +55,12 @@ COPY --from=submodule-update /opt/pytorch /opt/pytorch RUN make triton RUN --mount=type=cache,target=/opt/ccache \ export eval ${CMAKE_VARS} && \ - TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ + TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ python setup.py install FROM conda as conda-installs -ARG PYTHON_VERSION=3.8 +ARG PYTHON_VERSION=3.11 ARG CUDA_VERSION=12.1 ARG CUDA_CHANNEL=nvidia ARG INSTALL_CHANNEL=pytorch-nightly @@ -99,6 +97,7 @@ ENV PATH /opt/conda/bin:$PATH ENV NVIDIA_VISIBLE_DEVICES all ENV NVIDIA_DRIVER_CAPABILITIES compute,utility ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH ENV PYTORCH_VERSION ${PYTORCH_VERSION} WORKDIR /workspace diff --git a/README.md index ae3e1330c02e9..3ff42586109c3 100644 --- a/README.md +++ b/README.md @@ -158,16 +158,16 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv) #### Prerequisites If you are installing from source, you will need: - Python 3.8 or later (for Linux, Python 3.8.1+ is needed) -- A compiler that fully supports C++17, such as clang or gcc (especially for aarch64, gcc 9.4.0 or newer is required) +- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required) We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro. If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following: - [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) -- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v7 or above +- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above - [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA -Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/pdf/cuDNN-Support-Matrix.pdf) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware +Note: You can refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware If you want to disable CUDA support, export the environment variable `USE_CUDA=0`. Other potentially useful environment variables may be found in `setup.py`.
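Once a source build (or the image above) finishes, a quick way to confirm what the resulting `torch` package was actually compiled with is to query it directly. This is only an illustrative sanity check, not part of the build or the PR; it assumes nothing beyond a standard `torch` installation:

```python
# Minimal sketch: report what this torch build was compiled with.
import torch

print("torch version:", torch.__version__)
print("built with CUDA:", torch.version.cuda)            # None for CPU-only builds (e.g. USE_CUDA=0)
print("cuDNN version:", torch.backends.cudnn.version())  # None when cuDNN support is absent
print("CUDA available at runtime:", torch.cuda.is_available())
if torch.cuda.is_available():
    # GPU architectures the binary was built for (cf. TORCH_CUDA_ARCH_LIST in the Dockerfile above)
    print("compiled arch list:", torch.cuda.get_arch_list())
```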
diff --git a/RELEASE.md index aeb98825c592b..cfb1a089a02fe 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -17,6 +17,7 @@ - [Release Candidate Storage](#release-candidate-storage) - [Release Candidate health validation](#release-candidate-health-validation) - [Cherry Picking Fixes](#cherry-picking-fixes) + - [How to do Cherry Picking](#how-to-do-cherry-picking) - [Cherry Picking Reverts](#cherry-picking-reverts) - [Preparing and Creating Final Release candidate](#preparing-and-creating-final-release-candidate) - [Promoting RCs to Stable](#promoting-rcs-to-stable) @@ -49,7 +50,8 @@ Following is the Release Compatibility Matrix for PyTorch releases: | PyTorch version | Python | Stable CUDA | Experimental CUDA | | --- | --- | --- | --- | -| 2.2 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | +| 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | +| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | | 2.1 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | | 2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 | | 1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | @@ -217,7 +219,7 @@ Validate the release jobs for pytorch and domain libraries should be green. Vali * [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/release%2F1.12) * [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/release%2F1.12) -Validate that the documentation build has completed and generated entry corresponding to the release in [docs folder](https://github.com/pytorch/pytorch.github.io/tree/site/docs/) of pytorch.github.io repository +Validate that the documentation build has completed and generated an entry corresponding to the release in the [docs repository](https://github.com/pytorch/docs/tree/main/) ### Cherry Picking Fixes @@ -232,6 +234,32 @@ Please also make sure to add milestone target to the PR/issue, especially if it **NOTE**: The cherry pick process is not an invitation to add new features, it is mainly there to fix regressions +#### How to do Cherry Picking + +You can now use `pytorchbot` to cherry pick a PyTorch PR that has been committed +to the main branch using the `@pytorchbot cherry-pick` command as follows. + +``` +usage: @pytorchbot cherry-pick --onto ONTO [--fixes FIXES] -c + {regression,critical,fixnewfeature,docs,release} + +Cherry pick a pull request onto a release branch for inclusion in a release + +optional arguments: + --onto ONTO Branch you would like to cherry pick onto (Example: release/2.2) + --fixes FIXES Link to the issue that your PR fixes (i.e. https://github.com/pytorch/pytorch/issues/110666) + -c {regression,critical,fixnewfeature,docs,release} + A machine-friendly classification of the cherry-pick reason. +``` + +For example, [#120567](https://github.com/pytorch/pytorch/pull/120567#issuecomment-1978964376) +created a cherry pick PR [#121232](https://github.com/pytorch/pytorch/pull/121232) onto `release/2.2` +branch to fix a regression issue. You can then refer to the original +and the cherry-picked PRs on the release tracker issue. Please note +that the cherry-picked PR will still need to be reviewed by the PyTorch +RelEng team before it can go into the release branch. This feature +requires `pytorchbot`, so it's only available in PyTorch at the moment. An illustrative bot comment is sketched after this section. + ### Cherry Picking Reverts If PR that has been cherry-picked into release branch has been reverted, it's cherry-pick must be reverted as well.
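For reference, the pieces described in the RELEASE.md section above combine into a single comment posted on the already-merged PR. The branch, classification, and issue link below are illustrative, reusing the values from the usage text rather than referring to any particular PR:

```
@pytorchbot cherry-pick --onto release/2.2 -c regression --fixes https://github.com/pytorch/pytorch/issues/110666
```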
diff --git a/SECURITY.md b/SECURITY.md index 0651f82b70c6e..e8e0249fc896c 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,9 +1,56 @@ -# Reporting Security Issues +# Security Policy -If you believe you have found a security vulnerability in PyTorch, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. + - [**Reporting a Vulnerability**](#reporting-a-vulnerability) + - [**Using Pytorch Securely**](#using-pytorch-securely) + - [Untrusted models](#untrusted-models) + - [Untrusted inputs](#untrusted-inputs) + - [Data privacy](#data-privacy) + +## Reporting Security Issues + +Beware that none of the topics under [Using Pytorch Securely](#using-pytorch-securely) are considered vulnerabilities of Pytorch. + +However, if you believe you have found a security vulnerability in PyTorch, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported: https://www.facebook.com/whitehat + + +## Using Pytorch Securely +**Pytorch models are programs**, so treat their security seriously -- running untrusted models is equivalent to running untrusted code. In general we recommend that model weights and the python code for the model are distributed independently. That said, be careful about where you get the python code from and who wrote it (preferentially check for provenance or checksums; do not blindly run any pip-installed package). + +### Untrusted models +Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources[^data-poisoning-sources]. + +**Prefer to execute untrusted models within a secure, isolated environment such as a sandbox** (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. You can find further details and instructions on [this page](https://developers.google.com/code-sandboxing). + +**Be mindful of risky model formats**. Prefer to share and load weights in the format most appropriate for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) with `weights_only=True` is also secure to our knowledge, even though it offers a significantly larger attack surface. Loading an untrusted checkpoint with `weights_only=False` MUST never be done. + + + +Important Note: The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance. + +[^data-poisoning-sources]: To understand the risks of using data from unknown sources, read the following papers on data poisoning: + https://arxiv.org/abs/2312.04748 + https://arxiv.org/abs/2401.05566 + +### Untrusted inputs during training and prediction + +If you plan to open your model to untrusted inputs, be aware that inputs can also be used as vectors by malicious agents. To minimize risks, make sure to give your model only the permissions strictly required, and keep your libraries updated with the latest security patches.
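To make the model-format guidance under "Untrusted models" above concrete, here is a minimal, hypothetical sketch (not part of the diff itself) of loading third-party weights defensively with `torch.load(..., weights_only=True)`; the checkpoint path and the tiny `nn.Linear` module are placeholders for illustration.

```python
import torch
import torch.nn as nn

# Hypothetical third-party checkpoint; the path is a placeholder.
CHECKPOINT = "untrusted_checkpoint.pt"

# weights_only=True uses a restricted unpickler that only accepts tensors
# and plain containers, rejecting objects that could execute code on load.
state_dict = torch.load(CHECKPOINT, weights_only=True)

# NEVER pass weights_only=False for files from unknown sources: full
# pickle deserialization can run arbitrary code.

# The model definition itself should come from code you trust and review.
model = nn.Linear(16, 4)
model.load_state_dict(state_dict)
```

If the weights are shipped in safetensors format instead, `safetensors.torch.load_file` avoids pickle entirely.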
+ +If applicable, prepare your model against bad inputs and prompt injections. Some recommendations: +- Pre-analysis: check how the model performs by default when exposed to prompt injection (e.g. using fuzzing for prompt injection). +- Input Sanitization: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as: + - Validation: Enforce strict rules on allowed characters and data types. + - Filtering: Remove potentially malicious scripts or code fragments. + - Encoding: Convert special characters into safe representations. + - Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)). + +### Data privacy + +**Take special security measures if you train models with sensitive data**. Prioritize [sandboxing](https://developers.google.com/code-sandboxing) your models and: +- Do not feed sensitive data to an untrusted model (even if it runs in a sandboxed environment) +- If you consider publishing a model that was partially trained with sensitive data, be aware that data can potentially be recovered from the trained weights (especially if the model overfits). diff --git a/WORKSPACE b/WORKSPACE index b187949d663e7..8eabea571a571 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -71,6 +71,13 @@ http_archive( ], ) +http_archive( + name = "com_github_opentelemetry-cpp", + urls = [ + "https://github.com/open-telemetry/opentelemetry-cpp/archive/refs/tags/v1.14.2.tar.gz", + ], +) + new_local_repository( name = "gloo", build_file = "//third_party:gloo.BUILD", @@ -155,6 +162,12 @@ new_local_repository( path = "third_party/kineto", ) +new_local_repository( + name = "opentelemetry-cpp", + build_file = "//third_party::opentelemetry-cpp.BUILD", + path = "third_party/opentelemetry-cpp", +) + new_patched_local_repository( name = "tbb", build_file = "//third_party:tbb.BUILD", @@ -206,11 +219,11 @@ py_repositories() load("@rules_python//python:repositories.bzl", "python_register_toolchains") python_register_toolchains( - name = "python3_8", - python_version = "3.8", + name = "python3_10", + python_version = "3.10", ) -load("@python3_8//:defs.bzl", "interpreter") +load("@python3_10//:defs.bzl", "interpreter") load("@rules_python//python:pip.bzl", "pip_parse") pip_parse( diff --git a/android/README.md b/android/README.md index e13344aebe52b..d6a1ba1d4479b 100644 --- a/android/README.md +++ b/android/README.md @@ -9,7 +9,7 @@ Demo applications with code walk-through can be found in [this github repo](https ##### Release Release artifacts are published to jcenter: -``` +```groovy repositories { jcenter() } @@ -32,7 +32,7 @@ dependencies { Nightly(snapshots) builds are published every night from the `master` branch to [nexus sonatype snapshots repository](https://oss.sonatype.org/#nexus-search;quick~pytorch_android) To use them, the repository must be specified explicitly: -``` +```groovy repositories { maven { url "https://oss.sonatype.org/content/repositories/snapshots" @@ -62,7 +62,7 @@ The current nightly(snapshots) version is the value of `VERSION_NAME` in `gradle In some cases you might want to use a local build of pytorch android, for example you may build a custom libtorch binary with another set of operators or make local changes. For this you can use the `./scripts/build_pytorch_android.sh` script.
-``` +```bash git clone https://github.com/pytorch/pytorch.git cd pytorch git submodule update --init --recursive @@ -91,14 +91,14 @@ They are specified as environment variables: After successful build you should see the result as aar file: -``` +```bash $ find pytorch_android/build/ -type f -name *aar pytorch_android/build/outputs/aar/pytorch_android.aar pytorch_android_torchvision/build/outputs/aar/pytorch_android.aar ``` It can be used directly in android projects, as a gradle dependency: -``` +```groovy allprojects { repositories { flatDir { @@ -126,7 +126,7 @@ You can check out [test app example](https://github.com/pytorch/pytorch/blob/mas In some cases, you may want to use libtorch from your android native build. You can do it without building libtorch android, using native libraries from PyTorch android gradle dependency. For that, you will need to add the next lines to your gradle build. -``` +```groovy android { ... configurations { @@ -181,7 +181,7 @@ The added task will unpack them to gradle build directory. In your native build you can link to them adding these lines to your CMakeLists.txt: -``` +```cmake # Relative path of gradle build directory to CMakeLists.txt set(build_DIR ${CMAKE_SOURCE_DIR}/build) @@ -209,7 +209,7 @@ target_link_libraries(${PROJECT_NAME} If your CMakeLists.txt file is located in the same directory as your build.gradle, `set(build_DIR ${CMAKE_SOURCE_DIR}/build)` should work for you. But if you have another location of it, you may need to change it. After that, you can use libtorch C++ API from your native code. -``` +```cpp #include #include #include diff --git a/android/pytorch_android/generate_test_torchscripts.py b/android/pytorch_android/generate_test_torchscripts.py index c3c9518517ae7..a487bd1242e0c 100644 --- a/android/pytorch_android/generate_test_torchscripts.py +++ b/android/pytorch_android/generate_test_torchscripts.py @@ -125,6 +125,15 @@ def conv2d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor: r = r.contiguous() return r + @torch.jit.script_method + def conv3d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor: + r = torch.nn.functional.conv3d(x, w) + if toChannelsLast: + r = r.contiguous(memory_format=torch.channels_last_3d) + else: + r = r.contiguous() + return r + @torch.jit.script_method def contiguous(self, x: Tensor) -> Tensor: return x.contiguous() diff --git a/android/pytorch_android/src/androidTest/assets/android_api_module.ptl b/android/pytorch_android/src/androidTest/assets/android_api_module.ptl index df62dd8620881..9adfb84bf8551 100644 Binary files a/android/pytorch_android/src/androidTest/assets/android_api_module.ptl and b/android/pytorch_android/src/androidTest/assets/android_api_module.ptl differ diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java index d2dfa93da17a3..7980a34c04347 100644 --- a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java @@ -348,15 +348,32 @@ public void testChannelsLast3d() throws IOException { @Test public void testChannelsLastConv2d() throws IOException { long[] inputShape = new long[] {1, 3, 2, 2}; - long[] dataNCHW = new long[] {1, 2, 3, 4, 11, 12, 13, 14, 101, 102, 103, 104}; + long[] dataNCHW = new long[] { + 111, 112, + 121, 122, + + 211, 212, + 221, 222, + + 311, 312, + 321, 322}; Tensor inputNCHW = Tensor.fromBlob(dataNCHW, inputShape, 
MemoryFormat.CONTIGUOUS); - long[] dataNHWC = new long[] {1, 11, 101, 2, 12, 102, 3, 13, 103, 4, 14, 104}; - Tensor inputNHWC = Tensor.fromBlob(dataNHWC, inputShape, MemoryFormat.CHANNELS_LAST); + long[] dataNHWC = new long[] { + 111, 211, 311, 112, 212, 312, + 121, 221, 321, 122, 222, 322}; + Tensor inputNHWC = Tensor.fromBlob(dataNHWC, inputShape, MemoryFormat.CHANNELS_LAST); long[] weightShape = new long[] {3, 3, 1, 1}; - long[] dataWeightOIHW = new long[] {2, 0, 0, 0, 1, 0, 0, 0, -1}; + long[] dataWeightOIHW = new long[] { + 2, 0, 0, + 0, 1, 0, + 0, 0, -1}; Tensor wNCHW = Tensor.fromBlob(dataWeightOIHW, weightShape, MemoryFormat.CONTIGUOUS); - long[] dataWeightOHWI = new long[] {2, 0, 0, 0, 1, 0, 0, 0, -1}; + long[] dataWeightOHWI = new long[] { + 2, 0, 0, + 0, 1, 0, + 0, 0, -1}; + Tensor wNHWC = Tensor.fromBlob(dataWeightOHWI, weightShape, MemoryFormat.CHANNELS_LAST); final Module module = loadModel(TEST_MODULE_ASSET_NAME); @@ -367,7 +384,15 @@ public void testChannelsLastConv2d() throws IOException { outputNCHW, MemoryFormat.CONTIGUOUS, new long[] {1, 3, 2, 2}, - new long[] {2, 4, 6, 8, 11, 12, 13, 14, -101, -102, -103, -104}); + new long[] { + 2*111, 2*112, + 2*121, 2*122, + + 211, 212, + 221, 222, + + -311, -312, + -321, -322}); final IValue outputNHWC = module.runMethod("conv2d", IValue.from(inputNHWC), IValue.from(wNHWC), IValue.from(true)); @@ -375,7 +400,89 @@ public void testChannelsLastConv2d() throws IOException { outputNHWC, MemoryFormat.CHANNELS_LAST, new long[] {1, 3, 2, 2}, - new long[] {2, 11, -101, 4, 12, -102, 6, 13, -103, 8, 14, -104}); + new long[] { + 2*111, 211, -311, 2*112, 212, -312, + 2*121, 221, -321, 2*122, 222, -322}); + } + + @Test + public void testChannelsLastConv3d() throws IOException { + long[] inputShape = new long[] {1, 3, 2, 2, 2}; + long[] dataNCDHW = new long[] { + 1111, 1112, + 1121, 1122, + 1211, 1212, + 1221, 1222, + + 2111, 2112, + 2121, 2122, + 2211, 2212, + 2221, 2222, + + 3111, 3112, + 3121, 3122, + 3211, 3212, + 3221, 3222}; + Tensor inputNCDHW = Tensor.fromBlob(dataNCDHW, inputShape, MemoryFormat.CONTIGUOUS); + long[] dataNDHWC = new long[] { + 1111, 2111, 3111, + 1112, 2112, 3112, + + 1121, 2121, 3121, + 1122, 2122, 3122, + + 1211, 2211, 3211, + 1212, 2212, 3212, + + 1221, 2221, 3221, + 1222, 2222, 3222}; + + Tensor inputNDHWC = Tensor.fromBlob(dataNDHWC, inputShape, MemoryFormat.CHANNELS_LAST_3D); + + long[] weightShape = new long[] {3, 3, 1, 1, 1}; + long[] dataWeightOIDHW = new long[] { + 2, 0, 0, + 0, 1, 0, + 0, 0, -1, + }; + Tensor wNCDHW = Tensor.fromBlob(dataWeightOIDHW, weightShape, MemoryFormat.CONTIGUOUS); + long[] dataWeightODHWI = new long[] { + 2, 0, 0, + 0, 1, 0, + 0, 0, -1, + }; + Tensor wNDHWC = Tensor.fromBlob(dataWeightODHWI, weightShape, MemoryFormat.CHANNELS_LAST_3D); + + final Module module = loadModel(TEST_MODULE_ASSET_NAME); + + final IValue outputNCDHW = + module.runMethod("conv3d", IValue.from(inputNCDHW), IValue.from(wNCDHW), IValue.from(false)); + assertIValueTensor( + outputNCDHW, + MemoryFormat.CONTIGUOUS, + new long[] {1, 3, 2, 2, 2}, + new long[] { + 2*1111, 2*1112, 2*1121, 2*1122, + 2*1211, 2*1212, 2*1221, 2*1222, + + 2111, 2112, 2121, 2122, + 2211, 2212, 2221, 2222, + + -3111, -3112, -3121, -3122, + -3211, -3212, -3221, -3222}); + + final IValue outputNDHWC = + module.runMethod("conv3d", IValue.from(inputNDHWC), IValue.from(wNDHWC), IValue.from(true)); + assertIValueTensor( + outputNDHWC, + MemoryFormat.CHANNELS_LAST_3D, + new long[] {1, 3, 2, 2, 2}, + new long[] { + 2*1111, 2111, -3111, 2*1112, 2112, 
-3112, + 2*1121, 2121, -3121, 2*1122, 2122, -3122, + + 2*1211, 2211, -3211, 2*1212, 2212, -3212, + 2*1221, 2221, -3221, 2*1222, 2222, -3222}); } @Test diff --git a/android/pytorch_android/test_asset.jit b/android/pytorch_android/test_asset.jit index 3bd9037da4ee6..8605ab13d555e 100644 --- a/android/pytorch_android/test_asset.jit +++ b/android/pytorch_android/test_asset.jit @@ -84,6 +84,15 @@ def conv2d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor: r = r.contiguous() return r +def conv3d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor: + r = torch.conv3d(x, w) + if (toChannelsLast): + # memory_format=torch.channels_last_3d + r = r.contiguous(memory_format=2) + else: + r = r.contiguous() + return r + def contiguous(self, x: Tensor) -> Tensor: return x.contiguous() diff --git a/aten.bzl b/aten.bzl index 9c6325d16abfb..6d8cff6d1ae7d 100644 --- a/aten.bzl +++ b/aten.bzl @@ -62,11 +62,10 @@ def generate_aten_impl(ctx): outputs = [ops_dir] + ctx.outputs.outs install_dir = paths.dirname(ops_dir.path) - tool_inputs, tool_inputs_manifest = ctx.resolve_tools(tools = [ctx.attr.generator]) - ctx.actions.run_shell( + ctx.actions.run( outputs = outputs, inputs = ctx.files.srcs, - command = ctx.executable.generator.path + " $@", + executable = ctx.executable.generator, arguments = [ "--source-path", "aten/src/ATen", @@ -74,8 +73,6 @@ def generate_aten_impl(ctx): "--install_dir", install_dir, ], - tools = tool_inputs, - input_manifests = tool_inputs_manifest, use_default_shell_env = True, mnemonic = "GenerateAten", ) diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index 412b2a603231d..bda6aea327062 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -18,6 +18,8 @@ cmake_policy(SET CMP0012 NEW) ############################################# set(ATen_CPU_SRCS) +set(ATen_XPU_SRCS) +set(ATen_XPU_INCLUDE) set(ATen_CPU_TEST_SRCS) set(ATen_CPU_INCLUDE) set(ATen_THIRD_PARTY_INCLUDE) @@ -34,8 +36,12 @@ set(ATen_HIP_TEST_SRCS) set(ATen_HIP_INCLUDE) set(ATen_MPS_SRCS) set(ATen_MPS_TEST_SRCS) +set(ATen_XPU_SRCS) +set(ATen_XPU_INCLUDE) +set(ATen_XPU_TEST_SRCS) set(ATen_VULKAN_TEST_SRCS) set(ATen_CPU_DEPENDENCY_LIBS) +set(ATen_XPU_DEPENDENCY_LIBS) set(ATen_CUDA_DEPENDENCY_LIBS) set(ATen_HIP_DEPENDENCY_LIBS) set(ATen_PUBLIC_CUDA_DEPENDENCY_LIBS) @@ -102,6 +108,8 @@ add_subdirectory(src/ATen) # Pass source, includes, and libs to parent set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) +set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) +set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE) set(ATen_CUDA_CPP_SRCS ${ATen_CUDA_CPP_SRCS} PARENT_SCOPE) set(ATen_CUDA_LINALG_SRCS ${ATen_CUDA_LINALG_SRCS} PARENT_SCOPE) @@ -111,6 +119,8 @@ set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE) set(ATen_MPS_TEST_SRCS ${ATen_MPS_TEST_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS_W_SORT_BY_KEY ${ATen_HIP_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) +set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) +set(ATen_XPU_TEST_SRCS ${ATen_XPU_TEST_SRCS} PARENT_SCOPE) set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) @@ -122,8 +132,10 @@ set(ATen_VEC_TEST_SRCS ${ATen_VEC_TEST_SRCS} PARENT_SCOPE) set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} 
PARENT_SCOPE) +set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE) set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_XPU_DEPENDENCY_LIBS ${ATen_XPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/AccumulateType.h b/aten/src/ATen/AccumulateType.h index f96f34e1e6b6d..0275ef099b03d 100644 --- a/aten/src/ATen/AccumulateType.h +++ b/aten/src/ATen/AccumulateType.h @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include // Defines the accumulation type for a scalar type. @@ -87,6 +89,8 @@ MPS_ACC_TYPE(BFloat16, float); MPS_ACC_TYPE(Half, float); MPS_ACC_TYPE(Float8_e5m2, float); MPS_ACC_TYPE(Float8_e4m3fn, float); +MPS_ACC_TYPE(Float8_e5m2fnuz, float); +MPS_ACC_TYPE(Float8_e4m3fnuz, float); MPS_ACC_TYPE(float, float); MPS_ACC_TYPE(double, float); MPS_ACC_TYPE(int8_t, int64_t); @@ -107,6 +111,8 @@ CUDA_ACC_TYPE(BFloat16, float); CUDA_ACC_TYPE(Half, float); CUDA_ACC_TYPE(Float8_e5m2, float); CUDA_ACC_TYPE(Float8_e4m3fn, float); +CUDA_ACC_TYPE(Float8_e5m2fnuz, float); +CUDA_ACC_TYPE(Float8_e4m3fnuz, float); CUDA_ACC_TYPE(float, float); CUDA_ACC_TYPE(double, double); CUDA_ACC_TYPE(int8_t, int64_t); @@ -123,8 +129,8 @@ CUDA_ACC_TYPE(c10::complex, c10::complex); CPU_ACC_TYPE(BFloat16, float); CPU_ACC_TYPE(Half, float); CPU_ACC_TYPE(Float8_e5m2, float); -CPU_ACC_TYPE(Float8_e5m2fnuz, float); CPU_ACC_TYPE(Float8_e4m3fn, float); +CPU_ACC_TYPE(Float8_e5m2fnuz, float); CPU_ACC_TYPE(Float8_e4m3fnuz, float); CPU_ACC_TYPE(float, double); CPU_ACC_TYPE(double, double); diff --git a/aten/src/ATen/BlasBackend.h b/aten/src/ATen/BlasBackend.h new file mode 100644 index 0000000000000..7f8c321ad9fa2 --- /dev/null +++ b/aten/src/ATen/BlasBackend.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +#include +#include + +namespace at { + +enum class BlasBackend : int8_t { Cublas, Cublaslt }; + +inline std::string BlasBackendToString(at::BlasBackend backend) { + switch (backend) { + case BlasBackend::Cublas: + return "at::BlasBackend::Cublas"; + case BlasBackend::Cublaslt: + return "at::BlasBackend::Cublaslt"; + default: + TORCH_CHECK(false, "Unknown blas backend"); + } +} + +inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) { + return stream << BlasBackendToString(backend); +} + +} // namespace at diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index d4ccca9746654..583662e6c63d0 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -48,6 +48,7 @@ endif() # NB: If you edit these globs, you'll have to update setup.py package_data as well file(GLOB_RECURSE ATen_CORE_HEADERS "core/*.h") file(GLOB_RECURSE ATen_CORE_SRCS "core/*.cpp") +file(GLOB_RECURSE ATen_TRANSFORMER_HEADERS "native/transformers/*.h") if(NOT BUILD_LITE_INTERPRETER) file(GLOB_RECURSE ATen_CORE_TEST_SRCS "core/*_test.cpp") endif() @@ -60,19 +61,22 @@ endif() file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/vec256/vsx/*.h" "cpu/vec/vec256/zarch/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h") file(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp" "functorch/*.cpp") -file(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh") -file(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") 
+file(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh" "cuda/tunable/*.cuh" "cuda/tunable/*.h") +file(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp" "cuda/tunable/*.cpp") file(GLOB cuda_nvrtc_stub_h "cuda/nvrtc_stub/*.h") file(GLOB cuda_nvrtc_stub_cpp "cuda/nvrtc_stub/*.cpp") -file(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu") +file(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu" "cuda/tunable/*.cu") file(GLOB cudnn_h "cudnn/*.h" "cudnn/*.cuh") file(GLOB cudnn_cpp "cudnn/*.cpp") file(GLOB ops_h "ops/*.h") -file(GLOB hip_h "hip/*.h" "hip/detail/*.h" "hip/*.cuh" "hip/detail/*.cuh" "hip/impl/*.h") -file(GLOB hip_cpp "hip/*.cpp" "hip/detail/*.cpp" "hip/impl/*.cpp") +file(GLOB xpu_h "xpu/*.h" "xpu/detail/*.h") +file(GLOB xpu_cpp "xpu/*.cpp" "xpu/detail/*.cpp") + +file(GLOB hip_h "hip/*.h" "hip/detail/*.h" "hip/*.cuh" "hip/detail/*.cuh" "hip/impl/*.h" "hip/tunable/*.cuh" "hip/tunable/*.h") +file(GLOB hip_cpp "hip/*.cpp" "hip/detail/*.cpp" "hip/impl/*.cpp" "hip/tunable/*.cpp") list(REMOVE_ITEM hip_cpp "${CMAKE_CURRENT_SOURCE_DIR}/hip/detail/LazyNVRTC.cpp") -file(GLOB hip_hip "hip/*.hip" "hip/detail/*.hip" "hip/impl/*.hip") +file(GLOB hip_hip "hip/*.hip" "hip/detail/*.hip" "hip/impl/*.hip" "hip/tunable/*.hip") file(GLOB hip_nvrtc_stub_h "hip/nvrtc_stub/*.h") file(GLOB hip_nvrtc_stub_cpp "hip/nvrtc_stub/*.cpp") file(GLOB miopen_h "miopen/*.h") @@ -81,6 +85,8 @@ file(GLOB miopen_cpp "miopen/*.cpp") file(GLOB mkl_cpp "mkl/*.cpp") file(GLOB mkldnn_cpp "mkldnn/*.cpp") +file(GLOB mkldnn_xpu_cpp "native/mkldnn/xpu/*.cpp" "native/mkldnn/xpu/detail/*.cpp") + file(GLOB native_cpp "native/*.cpp") file(GLOB native_mkl_cpp "native/mkl/*.cpp") file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") @@ -138,6 +144,7 @@ file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") file(GLOB native_quantized_cuda_cu "native/quantized/cuda/*.cu") file(GLOB native_quantized_cuda_cpp "native/quantized/cuda/*.cpp") file(GLOB native_quantized_cudnn_cpp "native/quantized/cudnn/*.cpp") +file(GLOB native_nested_h "native/nested/*.h") file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") @@ -233,6 +240,20 @@ else() set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp}) endif() +if(USE_XPU) + list(APPEND ATen_XPU_SRCS ${mkldnn_xpu_cpp}) + list(APPEND ATen_XPU_DEPENDENCY_LIBS xpu_mkldnn) + + list(APPEND ATen_XPU_DEPENDENCY_LIBS ${OCL_LIBRARY}) + list(APPEND ATen_XPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/mkldnn/xpu) + list(APPEND ATen_XPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/mkldnn/xpu/detail) + list(APPEND ATen_XPU_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/ideep/mkl-dnn/include) + list(APPEND ATen_XPU_INCLUDE ${XPU_MKLDNN_INCLUDE}) + + list(APPEND ATen_XPU_INCLUDE ${SYCL_INCLUDE_DIR}) + list(APPEND ATen_XPU_DEPENDENCY_LIBS ${SYCL_LIBRARY}) +endif() + # Metal if(USE_PYTORCH_METAL_EXPORT) # Add files needed from exporting metal models(optimized_for_mobile) @@ -321,6 +342,11 @@ if(USE_ROCM) ) endif() +if(USE_XPU) + list(APPEND ATen_XPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/xpu) + list(APPEND ATen_XPU_SRCS ${xpu_cpp}) +endif() + list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/..) 
if(USE_TBB) @@ -410,50 +436,31 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) endif() -if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) - # Preserve values for the main build - set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) - set(__aten_sleef_build_tests ${BUILD_TESTS}) - - # Unset our restrictive C++ flags here and reset them later. - # Remove this once we use proper target_compile_options. - set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) - set(CMAKE_CXX_FLAGS) - - # Bump up optimization level for sleef to -O1, since at -O0 the compiler - # excessively spills intermediate vector registers to the stack - # and makes things run impossibly slowly - set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) - if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") - string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) - else() - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") +if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) + if(NOT MSVC) + # Bump up optimization level for sleef to -O1, since at -O0 the compiler + # excessively spills intermediate vector registers to the stack + # and makes things run impossibly slowly + set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") + string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) + else() + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") + endif() endif() if(NOT USE_SYSTEM_SLEEF) - set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) - set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) - set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) - set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) - set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) + set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) + set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) + set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) + set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) + set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." FORCE) if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) endif() endif() - if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND - CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.9 AND CMAKE_C_COMPILER_VERSION VERSION_LESS 8) - set(GCC_7 True) - else() - set(GCC_7 False) - endif() - if(GCC_7) - set(CMAKE_BUILD_TYPE Release) # Always build Sleef as a Release build to work around a gcc-7 bug - endif() add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/sleef" ${CMAKE_BINARY_DIR}/sleef) - if(GCC_7) - set(CMAKE_BUILD_TYPE ${OLD_CMAKE_BUILD_TYPE}) - endif() set_property(TARGET sleef PROPERTY FOLDER "dependencies") list(APPEND ATen_THIRD_PARTY_INCLUDE ${CMAKE_BINARY_DIR}/include) link_directories(${CMAKE_BINARY_DIR}/sleef/lib) @@ -468,12 +475,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) endif() list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) - set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) - set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) - - # Set these back. 
TODO: Use SLEEF_ to pass these instead - set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) - set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) + if(NOT MSVC) + set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) + endif() endif() if(USE_CUDA AND NOT USE_ROCM) @@ -577,9 +581,9 @@ configure_file(ATenConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/AT install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen") -set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS}) +set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS} ${native_nested_h} ${ATen_TRANSFORMER_HEADERS}) if(NOT INTERN_BUILD_MOBILE) - list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${mps_h} ${native_mps_h} ${native_utils_h} ${miopen_h}) + list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${xpu_h} ${mps_h} ${native_mps_h} ${native_utils_h} ${miopen_h}) # Metal if(USE_PYTORCH_METAL_EXPORT) # Add files needed from exporting metal models(optimized_for_mobile) @@ -641,6 +645,7 @@ list(APPEND ATen_MOBILE_BENCHMARK_SRCS # Pass source, includes, and libs to parent set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) +set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE) set(ATen_CUDA_CPP_SRCS ${ATen_CUDA_CPP_SRCS} PARENT_SCOPE) set(ATen_CUDA_LINALG_SRCS ${ATen_CUDA_LINALG_SRCS} PARENT_SCOPE) @@ -649,9 +654,11 @@ set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SC set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE) +set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) +set(ATen_XPU_TEST_SRCS ${ATen_XPU_TEST_SRCS} PARENT_SCOPE) set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE) set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE) @@ -664,9 +671,11 @@ set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} PARENT_SCOPE) +set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE) set(ATen_VULKAN_INCLUDE ${ATen_VULKAN_INCLUDE} PARENT_SCOPE) set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_XPU_DEPENDENCY_LIBS ${ATen_XPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE) set(FLASH_ATTENTION_CUDA_SOURCES ${FLASH_ATTENTION_CUDA_SOURCES} PARENT_SCOPE) set(MEM_EFF_ATTENTION_CUDA_SOURCES ${MEM_EFF_ATTENTION_CUDA_SOURCES} PARENT_SCOPE) diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 225390ab0beea..5c524ef97c475 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -6,7 +6,6 @@ #include #include #include -#include namespace at { diff 
--git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp index bf4306fce5772..2d086ebbe71fe 100644 --- a/aten/src/ATen/CPUGeneratorImpl.cpp +++ b/aten/src/ATen/CPUGeneratorImpl.cpp @@ -141,8 +141,8 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { using detail::CPUGeneratorImplState; using detail::CPUGeneratorImplStateLegacy; - static_assert(std::is_standard_layout::value, "CPUGeneratorImplStateLegacy is not a PODType"); - static_assert(std::is_standard_layout::value, "CPUGeneratorImplState is not a PODType"); + static_assert(std::is_standard_layout_v, "CPUGeneratorImplStateLegacy is not a PODType"); + static_assert(std::is_standard_layout_v, "CPUGeneratorImplState is not a PODType"); static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy); static const size_t size_current = sizeof(CPUGeneratorImplState); @@ -155,8 +155,7 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { auto double_normal_sample = c10::optional(); // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - CPUGeneratorImplStateLegacy* legacy_pod; + CPUGeneratorImplStateLegacy* legacy_pod{nullptr}; auto new_state_size = new_state.numel(); if (new_state_size == size_legacy) { legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data(); @@ -221,7 +220,7 @@ c10::intrusive_ptr CPUGeneratorImpl::get_state() const { using detail::CPUGeneratorImplState; static const size_t size = sizeof(CPUGeneratorImplState); - static_assert(std::is_standard_layout::value, "CPUGeneratorImplState is not a PODType"); + static_assert(std::is_standard_layout_v, "CPUGeneratorImplState is not a PODType"); auto state_tensor = at::detail::empty_cpu({(int64_t)size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); auto rng_state = state_tensor.data_ptr(); diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index da13bf05c4390..7fd191ef3f38c 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -7,12 +7,16 @@ #include #include #include +#include #include #ifdef USE_FBGEMM #include #endif // USE_FBGEMM +#if defined(__aarch64__) && !defined(C10_MOBILE) +#include +#endif namespace at { @@ -133,6 +137,15 @@ void Context::setSDPUseMath(bool e) { enabled_mathSDP = e; } +bool Context::userEnabledCuDNNSDP() const { + return enabled_cudnnSDP; +} + +void Context::setSDPUseCuDNN(bool e) { + enabled_cudnnSDP = e; +} + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) static const char cublas_config_var_name[] = "CUBLAS_WORKSPACE_CONFIG"; // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) @@ -165,7 +178,7 @@ void Context::alertCuBLASConfigNotDeterministic() const { "case, you must set an environment variable before running your PyTorch application: ", cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ", cublas_config_var_name, "=", cublas_deterministic_configs[1], ". 
For more information, go to ", - "https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility" + "https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility" ); if (deterministicAlgorithmsWarnOnly()) { @@ -250,6 +263,30 @@ void Context::setLinalgPreferredBackend(at::LinalgBackend b) { } } +at::BlasBackend Context::blasPreferredBackend() const { + return blas_preferred_backend; +} + +void Context::setBlasPreferredBackend(at::BlasBackend b) { +#ifdef _MSC_VER + TORCH_WARN_ONCE( + "torch.backends.cuda.preferred_blas_library is an experimental feature. " + "It is not supported on Windows." + ); +#else + TORCH_CHECK((b != at::BlasBackend::Cublaslt) || hasCuBLASLt(), + "Cannot set preferred backend to cuBLASLt if PyTorch has not been compiled with cuBLASLt."); + if (b != at::BlasBackend::Cublas) { + TORCH_WARN_ONCE( + "torch.backends.cuda.preferred_blas_library is an experimental feature. " + "If you see any error or unexpected behavior when this flag is set " + "please file an issue on GitHub." + ); + } + blas_preferred_backend = b; +#endif +} + bool Context::allowFP16ReductionCuBLAS() const { return allow_fp16_reduction_cublas; } @@ -424,25 +461,23 @@ bool NoTF32Guard::should_disable_tf32() { return override_allow_tf32_flag; } -#ifdef USE_ROCM // Ops can query this flag to know they are in the backward pass. // This information can be used, for example, to select implementations // with different numerical or performance characteristics. // See https://pytorch.org/docs/stable/notes/numerical_accuracy.html for details. -thread_local bool ROCmBackwardPassGuard::is_backward_pass_; +thread_local bool rocm_is_backward_pass; ROCmBackwardPassGuard::ROCmBackwardPassGuard() { - is_backward_pass_ = true; + rocm_is_backward_pass = true; } ROCmBackwardPassGuard::~ROCmBackwardPassGuard() { - is_backward_pass_ = false; + rocm_is_backward_pass = false; } bool ROCmBackwardPassGuard::is_backward_pass() { - return is_backward_pass_; + return rocm_is_backward_pass; } -#endif bool Context::areVmapFallbackWarningsEnabled() const { return display_vmap_fallback_warnings_; @@ -469,4 +504,21 @@ void Context::unsetDefaultMobileCPUAllocator() { c10::SetCPUAllocator(prev_allocator_ptr_ , /*priority*/ 100); prev_allocator_ptr_ = nullptr; } + +bool Context::allowFP16ReductionCPU() const { + return allow_fp16_reduction_cpu; +} + +void Context::setAllowFP16ReductionCPU(bool b) { + if ( b && !allow_fp16_reduction_cpu) { + // Check that CPU supports fp16 reductions +#if defined(__aarch64__) && !defined(C10_MOBILE) + if (!cpuinfo_initialize() || !cpuinfo_has_arm_fp16_arith()) +#else + if (true) +#endif + throw std::runtime_error("Float16 arithmetic is not supported by the CPU!"); + } + allow_fp16_reduction_cpu = b; +} } // namespace at diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 5baad73669af3..b50f0479e2fab 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -1,17 +1,20 @@ #pragma once +#include #include +#include #include #include #include #include #include +#include #include #include #include +#include #include #include -#include #include #include #include @@ -22,7 +25,6 @@ #include #include -#include #include namespace at { @@ -56,13 +58,34 @@ class TORCH_API Context { AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled."); } } + const AcceleratorHooksInterface& getAcceleratorHooksInterface( + c10::optional opt_device_type = c10::nullopt) { + c10::DeviceType device_type = opt_device_type.has_value() + ? 
opt_device_type.value() + : at::getAccelerator(true).value(); + if (device_type == at::kCUDA) { + return at::detail::getCUDAHooks(); + } else if (device_type == at::kMPS) { + return at::detail::getMPSHooks(); + } else if (device_type == at::kPrivateUse1) { + return at::detail::getPrivateUse1Hooks(); + } else if (device_type == at::kMTIA) { + return at::detail::getMTIAHooks(); + } else { + AT_ERROR( + c10::DeviceTypeName(device_type), " device type not an accelerator."); + } + } Device getDeviceFromPtr(void* data, c10::DeviceType device_type) { initCUDAIfNeeded(device_type); initHIPIfNeeded(device_type); + initXPUIfNeeded(device_type); if (device_type == at::kCPU) { return c10::DeviceType::CPU; } else if (device_type == at::kCUDA) { return at::detail::getCUDAHooks().getDeviceFromPtr(data); + } else if (device_type == at::kXPU) { + return at::detail::getXPUHooks().getDeviceFromPtr(data); } else if (device_type == at::kPrivateUse1) { return at::GetPrivateUse1HooksInterface()->getDeviceFromPtr(data); } else { @@ -100,6 +123,9 @@ class TORCH_API Context { static bool hasCuSOLVER() { return detail::getCUDAHooks().hasCuSOLVER(); } + static bool hasCuBLASLt() { + return detail::getCUDAHooks().hasCuBLASLt(); + } static bool hasHIP() { return detail::getHIPHooks().hasHIP(); } @@ -118,8 +144,8 @@ class TORCH_API Context { static bool hasLazy() { return c10::impl::hasDeviceGuardImpl(c10::DeviceType::Lazy); } - static bool hasORT() { - return c10::impl::hasDeviceGuardImpl(c10::DeviceType::ORT); + static bool hasMAIA() { + return c10::impl::hasDeviceGuardImpl(c10::DeviceType::MAIA); } // defined in header so that getNonVariableType has ability to inline // call_once check. getNonVariableType is called fairly frequently @@ -129,6 +155,12 @@ class TORCH_API Context { void lazyInitHIP() { c10::call_once(thh_init, [&] { detail::getHIPHooks().initHIP(); }); } + void lazyInitXPU() { + c10::call_once(thx_init, [&] { detail::getXPUHooks().initXPU(); }); + } + void lazyInitMTIA() { + c10::call_once(th_mtia_init, [&] { detail::getMTIAHooks().initMTIA(); }); + } void lazyInitPrivateUse1() { c10::call_once(thp_init, [&] { if (isPrivateUse1HooksRegistered()) { @@ -179,9 +211,15 @@ class TORCH_API Context { void setSDPUseMath(bool); bool userEnabledMathSDP() const; + void setSDPUseCuDNN(bool); + bool userEnabledCuDNNSDP() const; + at::LinalgBackend linalgPreferredBackend() const; void setLinalgPreferredBackend(at::LinalgBackend); + at::BlasBackend blasPreferredBackend() const; + void setBlasPreferredBackend(at::BlasBackend); + // Note [Enabling Deterministic Operations] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Operations in PyTorch that normally act nondeterministically, but have an @@ -262,7 +300,7 @@ class TORCH_API Context { // Throws an error if `Context::deterministicAlgorithms()` is true, CUDA // >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or // ":4096:8". 
For more details: - // https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility + // https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility void alertCuBLASConfigNotDeterministic() const; void setFloat32MatmulPrecision(const std::string& s); @@ -293,6 +331,8 @@ class TORCH_API Context { void setDefaultMobileCPUAllocator(); void unsetDefaultMobileCPUAllocator(); + bool allowFP16ReductionCPU() const; + void setAllowFP16ReductionCPU(bool); private: void initCUDAIfNeeded(c10::DeviceType p) { @@ -305,9 +345,16 @@ class TORCH_API Context { lazyInitHIP(); } } + void initXPUIfNeeded(c10::DeviceType p) { + if (p == c10::DeviceType::XPU) { + lazyInitXPU(); + } + } static bool checkCuBLASConfigDeterministic(); c10::once_flag thc_init; c10::once_flag thh_init; + c10::once_flag thx_init; + c10::once_flag th_mtia_init; c10::once_flag thp_init; bool enabled_cudnn = true; bool deterministic_cudnn = false; @@ -317,6 +364,7 @@ class TORCH_API Context { bool enabled_flashSDP = true; bool enabled_mem_efficientSDP = true; bool enabled_mathSDP = true; + bool enabled_cudnnSDP = false; #ifdef USE_ROCM bool benchmark_cudnn = true; #else @@ -336,6 +384,11 @@ class TORCH_API Context { c10::utils::check_env("TORCH_LINALG_PREFER_CUSOLVER") == true ? at::LinalgBackend::Cusolver : at::LinalgBackend::Default; + at::BlasBackend blas_preferred_backend = + (c10::utils::check_env("TORCH_BLAS_PREFER_CUBLASLT") == true || + c10::utils::check_env("TORCH_BLAS_PREFER_HIPBLASLT") == true) + ? at::BlasBackend::Cublaslt + : at::BlasBackend::Cublas; #ifdef C10_MOBILE bool release_original_weights = true; #else @@ -344,6 +397,7 @@ class TORCH_API Context { bool display_vmap_fallback_warnings_ = false; c10::optional quantized_engine = c10::nullopt; bool enable_sparse_tensor_invariant_checks = false; + bool allow_fp16_reduction_cpu = false; Allocator* prev_allocator_ptr_{nullptr}; }; @@ -407,8 +461,8 @@ static inline bool hasMPS() { return globalContext().hasMPS(); } -static inline bool hasORT() { - return globalContext().hasORT(); +static inline bool hasMAIA() { + return globalContext().hasMAIA(); } static inline bool hasXPU() { @@ -478,7 +532,7 @@ static inline void manual_seed(uint64_t seed) { } const auto xpu_num_gpus = detail::getXPUHooks().getNumGPUs(); - if (hasXPU() && xpu_num_gpus > 0) { + if (hasXPU() && xpu_num_gpus) { for (const auto i : c10::irange(xpu_num_gpus)) { auto xpu_gen = globalContext().defaultGenerator( Device(at::kXPU, static_cast(i))); @@ -515,15 +569,10 @@ struct TORCH_API NoTF32Guard { bool changed = false; }; -#ifdef USE_ROCM struct TORCH_API ROCmBackwardPassGuard { ROCmBackwardPassGuard(); ~ROCmBackwardPassGuard(); static bool is_backward_pass(); - - private: - static thread_local bool is_backward_pass_; }; -#endif } // namespace at diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 8f2cac8206d4d..3d2350d261013 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -9,6 +9,13 @@ DLDataType getDLDataType(const Tensor& t) { dtype.lanes = 1; dtype.bits = t.element_size() * 8; switch (t.scalar_type()) { + case ScalarType::UInt1: + case ScalarType::UInt2: + case ScalarType::UInt3: + case ScalarType::UInt4: + case ScalarType::UInt5: + case ScalarType::UInt6: + case ScalarType::UInt7: case ScalarType::Byte: case ScalarType::UInt16: case ScalarType::UInt32: @@ -81,9 +88,9 @@ DLDataType getDLDataType(const Tensor& t) { return dtype; } -static DLDevice getDLDevice(const Tensor& tensor, const int64_t& device_id) { +static DLDevice 
getDLDevice(const Tensor& tensor, c10::DeviceIndex device_id) { DLDevice ctx; - ctx.device_id = device_id; + ctx.device_id = static_cast(device_id); switch (tensor.device().type()) { case DeviceType::CPU: ctx.device_type = DLDeviceType::kDLCPU; @@ -104,8 +111,9 @@ static DLDevice getDLDevice(const Tensor& tensor, const int64_t& device_id) { ctx.device_type = DLDeviceType::kDLROCM; break; case DeviceType::XPU: - ctx = at::detail::getXPUHooks().getDLPackDeviceFromATenDevice( - ctx, tensor.device(), tensor.data_ptr()); + ctx.device_type = DLDeviceType::kDLOneAPI; + ctx.device_id = + at::detail::getXPUHooks().getGlobalIdxFromDevice(tensor.device()); break; default: TORCH_CHECK(false, "Cannot pack tensors on " + tensor.device().str()); @@ -132,7 +140,7 @@ static Device getATenDevice(const DLDevice& ctx, void* data) { return at::Device(DeviceType::HIP, ctx.device_id); #endif case DLDeviceType::kDLOneAPI: - return at::detail::getXPUHooks().getATenDeviceFromDLPackDevice(ctx, data); + return at::detail::getXPUHooks().getDeviceFromPtr(data); default: TORCH_CHECK( false, "Unsupported device_type: " + c10::to_string(ctx.device_type)); @@ -140,7 +148,7 @@ static Device getATenDevice(const DLDevice& ctx, void* data) { } ScalarType toScalarType(const DLDataType& dtype) { - ScalarType stype; + ScalarType stype = ScalarType::Undefined; TORCH_CHECK(dtype.lanes == 1, "ATen does not support lanes != 1"); switch (dtype.code) { case DLDataTypeCode::kDLUInt: @@ -148,6 +156,15 @@ ScalarType toScalarType(const DLDataType& dtype) { case 8: stype = ScalarType::Byte; break; + case 16: + stype = ScalarType::UInt16; + break; + case 32: + stype = ScalarType::UInt32; + break; + case 64: + stype = ScalarType::UInt64; + break; default: TORCH_CHECK( false, "Unsupported kUInt bits " + c10::to_string(dtype.bits)); @@ -225,8 +242,7 @@ ScalarType toScalarType(const DLDataType& dtype) { } break; default: - TORCH_CHECK( - false, "Unsupported code " + c10::to_string(dtype.code)); + TORCH_CHECK(false, "Unsupported code " + c10::to_string(dtype.code)); } return stype; } @@ -248,7 +264,7 @@ DLManagedTensor* toDLPack(const Tensor& src) { // gh-83069 auto shape = src.sizes(); auto strides = src.strides().vec(); - for (int i=0; itensor.manager_ctx = atDLMTensor; atDLMTensor->tensor.deleter = &deleter; atDLMTensor->tensor.dl_tensor.data = view.data_ptr(); - int64_t device_id = 0; + c10::DeviceIndex device_id = 0; if (src.is_cuda()) { device_id = src.get_device(); } atDLMTensor->tensor.dl_tensor.device = getDLDevice(src, device_id); atDLMTensor->tensor.dl_tensor.ndim = src.dim(); atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src); - atDLMTensor->tensor.dl_tensor.shape = - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - const_cast(view.sizes().data()); - atDLMTensor->tensor.dl_tensor.strides = - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - const_cast(view.strides().data()); + atDLMTensor->tensor.dl_tensor.shape = view.sizes().data(); + atDLMTensor->tensor.dl_tensor.strides = view.strides().data(); atDLMTensor->tensor.dl_tensor.byte_offset = 0; return &(atDLMTensor->tensor); } -Tensor fromDLPack(const DLManagedTensor* src) { +Tensor fromDLPack(DLManagedTensor* src) { auto deleter = [src](void* self) { if (src->deleter) { - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - src->deleter(const_cast(src)); + src->deleter(src); } }; return fromDLPack(src, std::move(deleter)); } Tensor fromDLPack( - const DLManagedTensor* src, + DLManagedTensor* src, std::function deleter) { Device device = 
getATenDevice(src->dl_tensor.device, src->dl_tensor.data); ScalarType stype = toScalarType(src->dl_tensor.dtype); @@ -296,7 +307,7 @@ Tensor fromDLPack( return at::from_blob( src->dl_tensor.data, IntArrayRef(src->dl_tensor.shape, src->dl_tensor.ndim), - deleter, + std::move(deleter), at::device(device).dtype(stype), {device}); } @@ -306,6 +317,6 @@ Tensor fromDLPack( IntArrayRef(src->dl_tensor.strides, src->dl_tensor.ndim), deleter, at::device(device).dtype(stype), - { device }); + {device}); } } // namespace at diff --git a/aten/src/ATen/DLConvertor.h b/aten/src/ATen/DLConvertor.h index 9b8fce1015fe4..b35c9657527d8 100644 --- a/aten/src/ATen/DLConvertor.h +++ b/aten/src/ATen/DLConvertor.h @@ -12,9 +12,13 @@ namespace at { TORCH_API ScalarType toScalarType(const DLDataType& dtype); TORCH_API DLManagedTensor* toDLPack(const Tensor& src); -TORCH_API Tensor fromDLPack(const DLManagedTensor* src); +TORCH_API Tensor fromDLPack(DLManagedTensor* src); +C10_DEPRECATED_MESSAGE("Please migrate to a non-const variant") +inline Tensor fromDLPack(const DLManagedTensor* src) { + return fromDLPack(const_cast(src)); +} TORCH_API Tensor -fromDLPack(const DLManagedTensor* src, std::function deleter); +fromDLPack(DLManagedTensor* src, std::function deleter); TORCH_API DLDataType getDLDataType(const Tensor& t); TORCH_API DLDevice getDLContext(const Tensor& tensor, const int64_t& device_id); diff --git a/aten/src/ATen/DeviceAccelerator.cpp b/aten/src/ATen/DeviceAccelerator.cpp new file mode 100644 index 0000000000000..ec3cd2a2f5527 --- /dev/null +++ b/aten/src/ATen/DeviceAccelerator.cpp @@ -0,0 +1,39 @@ +#include +#include + +namespace at { + +C10_API std::optional getAccelerator(bool checked) { +#define CHECK_NO_CUDA \ + TORCH_CHECK(!at::hasCUDA(), "Cannot have both CUDA and PrivateUse1"); + +#define CHECK_NO_PU1 \ + TORCH_CHECK(!is_privateuse1_backend_registered(), "Cannot have both CUDA and PrivateUse1"); + +#define CHECK_NO_MTIA \ + TORCH_CHECK(!at::hasMTIA(), "Cannot have MTIA with other devices"); + + if (is_privateuse1_backend_registered()) { + // We explicitly allow PrivateUse1 and another device at the same time + // as we use this for testing. + // Whenever a PrivateUse1 device is registered, use it first. + return kPrivateUse1; + } else if (at::hasCUDA()) { + CHECK_NO_PU1 + CHECK_NO_MTIA + return kCUDA; + } else if (at::hasMTIA()) { + CHECK_NO_CUDA + CHECK_NO_PU1 + return kMTIA; + } else { + TORCH_CHECK(!checked, "Cannot access accelerator device when none is available.") + return std::nullopt; + } + +#undef CHECK_NO_CUDA +#undef CHECK_NO_PU1 +} + + +} // namespace at diff --git a/aten/src/ATen/DeviceAccelerator.h b/aten/src/ATen/DeviceAccelerator.h new file mode 100644 index 0000000000000..c3e800c7e07c6 --- /dev/null +++ b/aten/src/ATen/DeviceAccelerator.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include + +#include +#include + +// This file defines the top level Accelerator concept for PyTorch. +// A device is an accelerator per the definition here if: +// - It is mutually exclusive with all other accelerators +// - It performs asynchronous compute via a Stream/Event system +// - It provides a set of common APIs as defined by AcceleratorHooksInterface +// +// As of today, accelerator devices are (in no particular order): +// CUDA, MTIA, PrivateUse1 +// We want to add once all the proper APIs are supported and tested: +// HIP, MPS, XPU + +namespace at { + +// Ensures that only one accelerator is available (at +// compile time if possible) and return it. 
+// When checked is true, the returned optional always has a value. +TORCH_API std::optional getAccelerator(bool checked = false); + +} // namespace at diff --git a/aten/src/ATen/Dispatch_v2.h b/aten/src/ATen/Dispatch_v2.h index f5f41ac47647c..e0764834c02fd 100644 --- a/aten/src/ATen/Dispatch_v2.h +++ b/aten/src/ATen/Dispatch_v2.h @@ -112,12 +112,12 @@ // Ensure we never have too many scalar types for the expansion here to // support. To bump this, you must regenerate the macros below. -static_assert(static_cast(c10::ScalarType::NumOptions) < 32); +static_assert(static_cast(c10::ScalarType::NumOptions) < 45); // Python code to regenerate generate code below: #if 0 -num_args = 32 +num_args = 45 nums = ', '.join(str(i) for i in reversed(range(num_args+1))) args = ', '.join(f'_{i}' for i in range(1, num_args+1)) @@ -135,8 +135,8 @@ for i in range(1, num_args+1): // Begin generated code // clang-format off -#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)) -#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, N, ...) N +#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)) +#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, N, ...) N #define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N) #define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) #define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) @@ -169,6 +169,18 @@ for i in range(1, num_args+1): #define AT_AP30(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) #define AT_AP31(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) 
AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) #define AT_AP32(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) - +#define AT_AP33(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) +#define AT_AP34(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) +#define AT_AP35(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35) AT_DISPATCH_CASE(_1, N) 
AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) +#define AT_AP36(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) +#define AT_AP37(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) +#define AT_AP38(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) 
AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) +#define AT_AP39(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) +#define AT_AP40(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) +#define AT_AP41(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, 
N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) +#define AT_AP42(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) +#define AT_AP43(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) +#define AT_AP44(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) 
AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) +#define AT_AP45(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) // End generated code // clang-format on diff --git a/aten/src/ATen/DynamicLibrary.cpp b/aten/src/ATen/DynamicLibrary.cpp index f3287121b2e26..7dc27f38fa7f0 100644 --- a/aten/src/ATen/DynamicLibrary.cpp +++ b/aten/src/ATen/DynamicLibrary.cpp @@ -25,9 +25,7 @@ static void* checkDL(void* x) { return x; } -DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name, bool leak_handle_): leak_handle(leak_handle_) { - // NOLINTNEXTLINE(hicpp-signed-bitwise) - handle = dlopen(name, RTLD_LOCAL | RTLD_NOW); +DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name, bool leak_handle_): leak_handle(leak_handle_), handle(dlopen(name, RTLD_LOCAL | RTLD_NOW)) { if (!handle) { if (alt_name) { handle = dlopen(alt_name, RTLD_LOCAL | RTLD_NOW); diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 459960bbf86ad..0b35fc67b53ac 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -1,6 +1,9 @@ #define TORCH_ASSERT_NO_OPERATORS #include #include +#include +#include +#include #include #include @@ -10,7 +13,18 @@ namespace at::detail { namespace { c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { if 
(pin_memory) { - return at::detail::getCUDAHooks().getPinnedMemoryAllocator(); + // NB: This is not quite right, if you somehow had both CUDA and PrivateUse1 initialized + // in the same PyTorch build, you would ONLY ever get the CUDA pinned memory allocator. + // To properly support this, see https://github.com/pytorch/pytorch/issues/14560 + if (at::globalContext().hasCUDA()) { + return at::detail::getCUDAHooks().getPinnedMemoryAllocator(); + } else if (at::globalContext().hasXPU()) { + return at::detail::getXPUHooks().getPinnedMemoryAllocator(); + } else if(at::isPrivateUse1HooksRegistered()) { + return at::GetPrivateUse1HooksInterface()->getPinnedMemoryAllocator(); + } else { + TORCH_CHECK(false, "Need to provide pin_memory allocator to use pin memory.") + } } return c10::GetCPUAllocator(); } @@ -80,7 +94,7 @@ size_t computeStorageNbytes( return 0; } - uint64_t strided_size; + uint64_t strided_size = 0; overflowed |= c10::mul_overflows(strides[i], sizes[i] - 1, &strided_size); overflowed |= c10::add_overflows(size, strided_size, &size); } @@ -134,7 +148,7 @@ SymInt computeStorageNbytes( // of the last element according to stride SymInt size = 1; for (const auto i : c10::irange(sizes.size())) { - if (sizes[i] == 0) { + if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[i].sym_eq(0))) { return 0; } @@ -187,6 +201,15 @@ TensorBase empty_generic( return _empty_generic(size, allocator, ks, scalar_type, memory_format_opt); } +TensorBase empty_generic_symint( + SymIntArrayRef size, + c10::Allocator* allocator, + c10::DispatchKeySet ks, + ScalarType scalar_type, + c10::optional memory_format_opt) { + return _empty_generic(size, allocator, ks, scalar_type, memory_format_opt); +} + template TensorBase _empty_strided_generic( T size, @@ -305,12 +328,13 @@ struct MetaAllocator final : public at::Allocator { static void deleter(void* const pointer) { TORCH_INTERNAL_ASSERT(!pointer); } - DataPtr allocate(const size_t nbytes) const override { + DataPtr allocate(const size_t nbytes) override { return {nullptr, nullptr, &deleter, at::Device(DeviceType::Meta)}; } DeleterFnPtr raw_deleter() const override { return deleter; } + void copy_data(void* dest, const void* src, std::size_t count) const final {} }; static MetaAllocator g_meta_alloc; diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h index 5f8681ce37f96..f6e2e53bc99f5 100644 --- a/aten/src/ATen/EmptyTensor.h +++ b/aten/src/ATen/EmptyTensor.h @@ -51,6 +51,13 @@ TORCH_API TensorBase empty_generic( ScalarType scalar_type, c10::optional memory_format_opt); +TORCH_API TensorBase empty_generic_symint( + SymIntArrayRef size, + c10::Allocator* allocator, + c10::DispatchKeySet ks, + ScalarType scalar_type, + c10::optional memory_format_opt); + TORCH_API TensorBase empty_strided_generic( IntArrayRef size, IntArrayRef stride, diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp index d066f99242ab0..cfa2f63a5b8a8 100644 --- a/aten/src/ATen/ExpandUtils.cpp +++ b/aten/src/ATen/ExpandUtils.cpp @@ -15,13 +15,13 @@ namespace { // NOTE: are_expandable did a similar check, please keep them sync if change is needed template Container infer_size_impl(ArrayType a, ArrayType b) { - size_t dimsA = a.size(); - size_t dimsB = b.size(); - size_t ndim = dimsA > dimsB ? dimsA : dimsB; + // Use ptrdiff_t to ensure signed comparison. + auto dimsA = static_cast(a.size()); + auto dimsB = static_cast(b.size()); + auto ndim = dimsA > dimsB ? dimsA : dimsB; Container expandedSizes(ndim); - // Use ptrdiff_t to ensure signed comparison. 
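// --- Illustrative sketch (not part of the upstream patch) ---------------------
// The infer_size_impl() hunk above only changes integer signedness (size_t vs.
// ptrdiff_t); the broadcasting rule itself is untouched. A minimal standalone
// version of that rule against plain std::vector<int64_t> shapes; the function
// and variable names here are illustrative, not ATen's:
#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include <vector>

inline std::vector<int64_t> infer_broadcast_size(
    const std::vector<int64_t>& a,
    const std::vector<int64_t>& b) {
  const size_t ndim = std::max(a.size(), b.size());
  std::vector<int64_t> out(ndim, 1);
  // Walk dimensions right-to-left; a missing leading dim behaves like size 1.
  for (size_t off = 0; off < ndim; ++off) {
    const int64_t da = off < a.size() ? a[a.size() - 1 - off] : 1;
    const int64_t db = off < b.size() ? b[b.size() - 1 - off] : 1;
    if (da != db && da != 1 && db != 1) {
      throw std::invalid_argument("shapes are not broadcastable");
    }
    out[ndim - 1 - off] = (da == 1) ? db : da;
  }
  return out;
}
// e.g. infer_broadcast_size({8, 1, 6}, {7, 1}) yields {8, 7, 6}.
// -------------------------------------------------------------------------------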
- for (ptrdiff_t i = (ptrdiff_t)ndim - 1; i >= 0; --i) { + for (ptrdiff_t i = ndim - 1; i >= 0; --i) { ptrdiff_t offset = ndim - 1 - i; ptrdiff_t dimA = dimsA - 1 - offset; ptrdiff_t dimB = dimsB - 1 - offset; @@ -63,8 +63,8 @@ C10_ALWAYS_INLINE InferExpandGeometryResult inferExpandGeometryImpl( IntArrayRef tensor_sizes, IntArrayRef tensor_strides, IntArrayRef sizes) { - int64_t ndim = sizes.size(); - int64_t tensor_dim = tensor_sizes.size(); + int64_t ndim = static_cast(sizes.size()); + int64_t tensor_dim = static_cast(tensor_sizes.size()); if (tensor_dim == 0) { return InferExpandGeometryResult(sizes, ndim); diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 82db1f8b6517c..03cfca36e7227 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -462,7 +462,8 @@ inline Tensor _sum_to( reduce_dims.push_back(i); } for (int64_t i = leading_dims; i < static_cast(sizes.size()); ++i) { - if (shape[i - leading_dims] == 1 && sizes[i] != 1) { + if (shape[i - leading_dims] == 1 && + TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(sizes[i], 1))) { reduce_dims.push_back(i); } } diff --git a/aten/src/ATen/FunctionalInverses.cpp b/aten/src/ATen/FunctionalInverses.cpp index 953636df5abc9..ebc24085a74a8 100644 --- a/aten/src/ATen/FunctionalInverses.cpp +++ b/aten/src/ATen/FunctionalInverses.cpp @@ -174,8 +174,8 @@ Tensor FunctionalInverses::expand_inverse(const Tensor& base, const Tensor& muta return mutated_view.as_strided_symint( base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); } else { - return at::sum_to( - mutated_view, + return base + at::sum_to( + mutated_view - base, base.sym_sizes(), /*always_return_non_view=*/inverse_return_mode == InverseReturnMode::NeverView ); @@ -224,48 +224,48 @@ Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor if (inverse_return_mode == InverseReturnMode::AlwaysView) { // NB: assumes mutated_view is a narrowed view of base. // We should NOT do this for functionalization - return mutated_view.as_strided_symint( - base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); + return mutated_view.slice_inverse_symint( + base, dim, std::move(start), std::move(end), std::move(step)); } else { return base.slice_scatter_symint(mutated_view, dim, std::move(start), std::move(end), std::move(step)); } } Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t mutated_view_idx, c10::SymInt split_size, int64_t dim) { + // It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can. + // For functionalization, we have only have one of the tensors from the TensorList outputed by split(), and we want to layer i + // on top of the base tensor. + // For autograd, we have all of the tensors outputted by split() and we just want to stack them. + dim = at::maybe_wrap_dim(dim, base.dim()); + auto dim_size = base.sym_size(dim); + auto start = split_size * mutated_view_idx; + auto end = split_size + start; + if (end > dim_size) end = dim_size; + if (inverse_return_mode == InverseReturnMode::AlwaysView) { // NB: assumes mutated_view is a narrowed view of base. 
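// --- Illustrative sketch (not part of the upstream patch) ---------------------
// The split_Tensor_inverse() change above hoists the chunk-boundary arithmetic
// out of the else branch so that both the view path and the scatter path share
// it. Chunk i of split(t, split_size, dim) covers [start, end) along dim, with
// the last chunk clamped to the dimension size. The same arithmetic with plain
// integers (names are illustrative; the real code uses c10::SymInt):
#include <algorithm>
#include <cstdint>
#include <utility>

inline std::pair<int64_t, int64_t> split_chunk_bounds(
    int64_t dim_size, int64_t split_size, int64_t chunk_idx) {
  const int64_t start = split_size * chunk_idx;
  const int64_t end = std::min(start + split_size, dim_size);
  return {start, end};
}
// e.g. split_chunk_bounds(/*dim_size=*/10, /*split_size=*/4, /*chunk_idx=*/2)
// yields {8, 10}: the last chunk is shorter than split_size.
// -------------------------------------------------------------------------------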
// We should NOT do this for functionalization - return mutated_view.as_strided_symint( - base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); + return mutated_view.slice_inverse_symint(base, dim, start, end, 1); } else { - // It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can. - // For functionalization, we have only have one of the tensors from the TensorList outputed by split(), and we want to layer i - // on top of the base tensor. - // For autograd, we have all of the tensors outputted by split() and we just want to stack them. - dim = at::maybe_wrap_dim(dim, base.dim()); - auto dim_size = base.sym_size(dim); - auto start = split_size * mutated_view_idx; - auto end = split_size + start; - if (end > dim_size) end = dim_size; return base.slice_scatter_symint(mutated_view, dim, start, end, 1); } } Tensor FunctionalInverses::split_with_sizes_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t mutated_view_idx, c10::SymIntArrayRef split_sizes, int64_t dim) { + dim = at::maybe_wrap_dim(dim, base.dim()); + auto dim_size = base.sym_size(dim); + c10::SymInt start = 0; + for (auto i = 0; i < mutated_view_idx; ++i) { + start += split_sizes[i]; + } + auto end = start + split_sizes[mutated_view_idx]; + if (end > dim_size) end = dim_size; + if (inverse_return_mode == InverseReturnMode::AlwaysView) { // NB: assumes mutated_view is a narrowed view of base. // We should NOT do this for functionalization - return mutated_view.as_strided_symint( - base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); + return mutated_view.slice_inverse_symint(base, dim, start, end, 1); } else { - dim = at::maybe_wrap_dim(dim, base.dim()); - auto dim_size = base.sym_size(dim); - c10::SymInt start = 0; - for (auto i = 0; i < mutated_view_idx; ++i) { - start += split_sizes[i]; - } - auto end = start + split_sizes[mutated_view_idx]; - if (end > dim_size) end = dim_size; return base.slice_scatter_symint(mutated_view, dim, start, end, 1); } } @@ -303,6 +303,29 @@ Tensor FunctionalInverses::_nested_view_from_buffer_inverse(const Tensor& base, return Tensor(); } +Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional& lengths, int64_t ragged_idx) { + auto values = at::_nested_get_values(mutated_view); + if (inverse_return_mode != InverseReturnMode::NeverView) { + return values; + } else { + return values.clone(/*memory_format=*/at::MemoryFormat::Contiguous); + } +} + +Tensor FunctionalInverses::_nested_get_values_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode) { + auto offsets = at::_nested_get_offsets(base); + auto lengths = at::_nested_get_lengths(base); + auto ragged_idx = at::_nested_get_ragged_idx(base); + auto dummy = at::_nested_get_jagged_dummy(base); + auto nt = at::_nested_view_from_jagged(mutated_view, offsets, dummy, lengths, ragged_idx); + + if (inverse_return_mode != InverseReturnMode::NeverView) { + return nt; + } else { + return nt.clone(/*memory_format=*/at::MemoryFormat::Contiguous); + } +} + Tensor FunctionalInverses::unsqueeze_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t dim) { if (inverse_return_mode != InverseReturnMode::NeverView) { return at::squeeze(mutated_view, dim); @@ -428,12 +451,22 @@ Tensor 
FunctionalInverses::narrow_inverse(const at::Tensor & base, const at::Ten if (inverse_return_mode == InverseReturnMode::AlwaysView) { // NB: assumes mutated_view is a narrowed view of base. // We should NOT do this for functionalization - return mutated_view.as_strided_symint( - base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); + return mutated_view.slice_inverse_symint(base, dim, std::move(start), start + length, 1); } else { return base.slice_scatter_symint( mutated_view, dim, std::move(start), start + length, 1); } } +Tensor FunctionalInverses::slice_inverse_inverse(const at::Tensor & base, const at::Tensor & mutated_view, InverseReturnMode inverse_return_mode, const at::Tensor & src, int64_t dim, std::optional start, std::optional end, c10::SymInt step) { + // slice_inverse() inverse is just slice() + if (inverse_return_mode == InverseReturnMode::NeverView) { + return at::slice_copy_symint( + mutated_view, dim, std::move(start), std::move(end), std::move(step)); + } else { + return mutated_view.slice_symint( + dim, std::move(start), std::move(end), std::move(step)); + } +} + } // namespace at::functionalization diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp index a2f486b7db681..78a5b6a9cfbe9 100644 --- a/aten/src/ATen/FunctionalStorageImpl.cpp +++ b/aten/src/ATen/FunctionalStorageImpl.cpp @@ -10,7 +10,7 @@ namespace at::functionalization { ViewMeta ViewMeta::to_out_idx(int64_t out_idx) { if (out_idx == this->out_index) return *this; - return ViewMeta(forward_fn, reverse_fn, is_multi_output, out_idx); + return ViewMeta(forward_fn, reverse_fn, is_multi_output, is_as_strided, out_idx); } // Note [Functionalization: Alias Removal Part 2] @@ -94,7 +94,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) get_nbytes(base), DataPtr{nullptr, base.device()}, GetAllocator(kMeta), - /*resizeable=*/true + /*resizable=*/true ), base_(base) { @@ -103,6 +103,18 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector& metas) { TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage"); + + if (metas.size() > 1) { + for (size_t i = 1; i < metas.size(); ++i) { + // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI + TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided, +"During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i, +" was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today," +"so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you " +"can insert a graph break right before the mutation with torch._dynamo.graph_break(). 
If you would like this behavior to " +"work properly, please comment on https://github.com/pytorch/pytorch/issues/104505."); + } + } updates_.push_back({updated_val, metas}); generation_++; } diff --git a/aten/src/ATen/FunctionalStorageImpl.h b/aten/src/ATen/FunctionalStorageImpl.h index 52c01c3a53c0c..8d899fe01624a 100644 --- a/aten/src/ATen/FunctionalStorageImpl.h +++ b/aten/src/ATen/FunctionalStorageImpl.h @@ -32,11 +32,13 @@ struct ViewMeta { std::function forward, std::function reverse, bool is_multi_output = false, + bool is_as_strided = false, int64_t out_idx = 0) : forward_fn(std::move(forward)), reverse_fn(std::move(reverse)), out_index(out_idx), - is_multi_output(is_multi_output) {} + is_multi_output(is_multi_output), + is_as_strided(is_as_strided) {} std::function forward_fn; std::function reverse_fn; @@ -46,6 +48,8 @@ struct ViewMeta { // Tells us if this is a multi-output view bool is_multi_output; + bool is_as_strided; + // Returns a copy of the current ViewMeta, if out_idx matches the current // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse // functions, but a new out index. @@ -79,7 +83,9 @@ struct ViewMeta { struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { public: struct Update { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const at::Tensor new_val; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const std::vector view_metas; }; @@ -101,6 +107,31 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { ~FunctionalStorageImpl() override = default; + void mark_mutation() { + mutation_counter_++; + } + void mark_mutation_during_no_grad_or_inference_mode() { + mutation_counter_during_no_grad_or_inference_mode_++; + } + void mark_mutation_hidden_from_autograd() { + mutation_counter_hidden_from_autograd_++; + } + + bool are_all_mutations_under_no_grad_or_inference_mode() const { + auto non_autograd_mutations = + mutation_counter_during_no_grad_or_inference_mode_ + + mutation_counter_hidden_from_autograd_; + // The <= is because both counters will technically be incremented, if we + // perform e.g. a triton kernel mutation under no_grad + return mutation_counter_ <= non_autograd_mutations; + } + + bool are_all_mutations_hidden_from_autograd() const { + // mutations under no_grad / inference_mode are technically not hidden from + // autograd - they change the version counter + return mutation_counter_ <= mutation_counter_hidden_from_autograd_; + } + private: // NB: base_ should always point to a tensor BELOW the current // functionalization layer. This is mainly to avoid reference cycles. e.g. @@ -119,6 +150,28 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { // If frozen, no more mutations are allowed on this storage. Once frozen, a // storage cannot be unfrozen. bool frozen_ = false; + + // These mutation counters are bumped on the storage + // whenever a FunctionalTensorWrapper experiences a mutation. + // When the mutation is under no_grad, or comes from a triton kernel, we also + // bump the corresponding during_no_grad or hidden_from_autograd counters. Why + // do we need to detect these two situations separately from "normal" input + // mutations? 
(1) "normal" input mutations can mutate autograd metadata like + // .grad_fn, + // in which case they need to be replayed outside of the compiled graph + // (2) "no_grad" input mutations are generally safe to keep in the graph (and + // compile), + // but they bump the tensor's VC, so we need to mark_dirty() on the inputs + // in torch.compile + // (3) mutations that are fully hidden from autograd (e.g. from a triton + // kernel) + // do not mutate any autograd state, and be fully kept in the graph + // When we detect that an input was mutated, we need to be able to tell if: + // (1) all of the mutations were from triton kernels + // (2) all of the mutations were under no_grad + uint64_t mutation_counter_during_no_grad_or_inference_mode_ = 0; + uint64_t mutation_counter_ = 0; + uint64_t mutation_counter_hidden_from_autograd_ = 0; }; } // namespace at::functionalization diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index bd260f241e00e..a7ba697d13932 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -129,7 +129,7 @@ void FunctionalTensorWrapper::freeze_storage() const { // - view_value: The output tensor that we need to wrap. // - base: The "base" of the view that `view_value` was generated from. // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic. -FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, functionalization::ViewMeta meta) +FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta) : c10::TensorImpl( c10::DispatchKeySet(DispatchKey::Functionalize), view_value.dtype(), @@ -174,7 +174,7 @@ bool FunctionalTensorWrapper::is_up_to_date() const { } // See Note [Functionalization Pass - Inplace View Ops] -void FunctionalTensorWrapper::mutate_view_meta(at::functionalization::ViewMeta meta) { +void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) { view_metas_.push_back(meta); // Manually track the fact that this tensor recieved a metadata mutation! has_metadata_mutation_ = true; @@ -212,7 +212,7 @@ void FunctionalTensorWrapper::mutate_view_meta(at::functionalization::ViewMeta m // In the above, tmp is a batched tensor (because adding a normal tensor to a batched tensor does broadcasting and creates a batched tensor). // But we can't just replace the underlying memory backing `tensor` with `tmp` - a batched tensor takes up more space! // Instead, every input, intermediate and output of the program is wrapped in a FunctionalTensorImpl, which wraps the underlying tensor. -void FunctionalTensorWrapper::replace_(const Tensor& other) { +void FunctionalTensorWrapper::replace_(const Tensor& other, bool from_lazy_regenerate) { // TODO: going to need to change this if we want nested functionalize() transforms. 
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(other)); value_ = other; @@ -231,10 +231,19 @@ void FunctionalTensorWrapper::replace_(const Tensor& other) { value_ = at::_to_copy(value_, c10::TensorOptions().dtype(dtype()).layout(layout())); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); } - mutation_counter_++; - if (!at::GradMode::is_enabled() || InferenceMode::is_enabled()) { - // This mutation happened under no_grad or inference_mode - mark_mutation_during_no_grad_or_inference_mode(); + // might not be until after the no_grad region is exited. + // Therefore, replace_() is not unconditionally safe to check the current no_grad state. + // If this is a lazy regeneration, then it is guaranteed that we have already + // done the mutation for the storage alias (when we originally performed the mutation), + // so no counter update may be needed. + // Example: if a mutation happens to a view under a no_grad, + // we won't call replace_() on the other alias until the alias is later used, which + if (!from_lazy_regenerate) { + mark_mutation(); + if (!at::GradMode::is_enabled() || InferenceMode::is_enabled()) { + // This mutation happened under no_grad or inference_mode + mark_mutation_during_no_grad_or_inference_mode(); + } } } @@ -328,17 +337,27 @@ void FunctionalTensorWrapper::sync_() { regenerate_from_base(); } +Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) { + auto t = base; + + // Reapply views to get the viewed tensor from the base in alias_ + for (auto& view_meta: view_metas_) { + t = view_meta.forward_fn(t, view_meta.out_index); + } + + return t; +} + void FunctionalTensorWrapper::regenerate_from_base() { at::AutoDispatchSkipFunctionalize guard; auto storage_impl = functional_storage_impl(); auto t = storage_impl->base(); + TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); - // Reapply views to get the viewed tensor from the base in alias_ - for (auto& view_meta: view_metas_) { - t = view_meta.forward_fn(t, view_meta.out_index); - } + t = apply_view_metas(t); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); - replace_(t); + + replace_(t, /*from_lazy_regenerate=*/true); generation_ = storage_impl->generation(); } @@ -352,6 +371,38 @@ const char* FunctionalTensorWrapper::tensorimpl_type_name() const { return "FunctionalTensorWrapper"; } +void FunctionalTensorWrapper::copy_tensor_metadata( + const FunctionalTensorWrapper* src_impl, + FunctionalTensorWrapper* dest_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) { + TensorImpl::copy_tensor_metadata( + src_impl, + dest_impl, + version_counter, + allow_tensor_metadata_change); + + // FunctionalTensorWrapper-specific fields. 
+ dest_impl->value_ = src_impl->value_; + dest_impl->level_ = src_impl->level_; + dest_impl->has_metadata_mutation_ = src_impl->has_metadata_mutation_; + dest_impl->is_multi_output_view_ = src_impl->is_multi_output_view_; + dest_impl->was_storage_changed_ = src_impl->was_storage_changed_; + dest_impl->generation_ = src_impl->generation_; + dest_impl->view_metas_ = src_impl->view_metas_; +} + + +void FunctionalTensorWrapper::copy_tensor_metadata_and_refresh( + const FunctionalTensorWrapper* src_impl, + FunctionalTensorWrapper* dest_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const { + copy_tensor_metadata(src_impl, dest_impl, version_counter, allow_tensor_metadata_change); + dest_impl->refresh_numel(); + dest_impl->refresh_contiguous(); +} + template c10::intrusive_ptr FunctionalTensorWrapper::shallow_copy_and_detach_core( VariableVersion&& version_counter, @@ -367,16 +418,11 @@ c10::intrusive_ptr FunctionalTensorWrapper::shallow_copy_and_detach_ } auto impl = c10::make_intrusive(value_); - copy_tensor_metadata( + copy_tensor_metadata_and_refresh( /*src_impl=*/this, /*dest_impl=*/impl.get(), /*version_counter=*/std::forward(version_counter), /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); - impl->level_ = level_; - impl->generation_ = generation_; - impl->view_metas_ = view_metas_; - impl->refresh_numel(); - impl->refresh_contiguous(); return impl; } @@ -394,6 +440,18 @@ c10::intrusive_ptr FunctionalTensorWrapper::shallow_copy_and_detach( std::move(version_counter), allow_tensor_metadata_change); } +void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptr& impl) { + AT_ASSERT(has_compatible_shallow_copy_type(impl->key_set())); + auto functional_impl = + static_cast(impl.get()); + copy_tensor_metadata_and_refresh( + /*src_impl=*/functional_impl, + /*dest_impl=*/this, + /*version_counter=*/version_counter(), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change()); +} + + c10::Device FunctionalTensorWrapper::device_custom() const { return value_.unsafeGetTensorImpl()->device(); } @@ -442,8 +500,8 @@ c10::optional to_functional_tensor(const c10::optional& tensor) } return c10::nullopt; } -c10::List> to_functional_tensor(const c10::List>& t_list) { - c10::List> outputs; +c10::List<::std::optional> to_functional_tensor(const c10::List<::std::optional>& t_list) { + c10::List<::std::optional> outputs; outputs.reserve(t_list.size()); for (const auto i : c10::irange(t_list.size())) { outputs.push_back(to_functional_tensor(t_list[i])); @@ -494,8 +552,8 @@ std::vector from_functional_tensor(ITensorListRef t_list) { } return outputs; } -c10::List> from_functional_tensor(const c10::List>& t_list) { - c10::List> outputs; +c10::List<::std::optional> from_functional_tensor(const c10::List<::std::optional>& t_list) { + c10::List<::std::optional> outputs; outputs.reserve(t_list.size()); for (const auto i : c10::irange(t_list.size())) { outputs.push_back(from_functional_tensor(t_list[i], /*assert_functional=*/false)); @@ -530,7 +588,7 @@ void sync(ITensorListRef t_list) { sync(t); } } -void sync(const c10::List>& t_list) { +void sync(const c10::List<::std::optional>& t_list) { for (const auto i : c10::irange(t_list.size())) { sync(t_list[i]); } @@ -610,7 +668,7 @@ bool isFunctionalTensor(const c10::optional& t) { } } -bool isFunctionalTensor(const c10::List>& t_list) { +bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { if (t_list.empty()) return false; auto functional_count = 0; for (const auto i : 
c10::irange(t_list.size())) { @@ -658,7 +716,7 @@ Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, c return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta); } -std::vector create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta) { +std::vector create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) { std::vector outputs(view_to_wrap.size()); int64_t i = 0; for (const auto& tensor : view_to_wrap) { @@ -668,10 +726,10 @@ std::vector create_functional_tensor_with_view_meta(ITensorListRef view_ return outputs; } -void mutate_view_meta(const at::Tensor& self, functionalization::ViewMeta meta) { +void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) { TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); - self_impl->mutate_view_meta(std::move(meta)); + self_impl->mutate_view_meta(meta); } // Note [Propagating strides in the functionalization pass] diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h index 1dd9104968592..d3237080535c0 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.h +++ b/aten/src/ATen/FunctionalTensorWrapper.h @@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { explicit FunctionalTensorWrapper( const Tensor& view_value, const FunctionalTensorWrapper* base, - functionalization::ViewMeta meta); + const functionalization::ViewMeta& meta); // Get the underlying, actual tensor, that doesn't know anything about // functionalization. @@ -75,26 +75,32 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { return has_metadata_mutation_; }; + void mark_mutation() { + functional_storage_impl()->mark_mutation(); + } // Denotes a mutation that's hidden from autograd, // e.g. for the purposes of passing a tensor to a triton kernel void mark_mutation_hidden_from_autograd() { - mutation_hidden_from_autograd_counter_++; + functional_storage_impl()->mark_mutation_hidden_from_autograd(); } void mark_mutation_during_no_grad_or_inference_mode() { - mutation_during_no_grad_or_inference_mode_++; + functional_storage_impl()->mark_mutation_during_no_grad_or_inference_mode(); } // Are all the mutations happening to the tensor hidden from autograd bool are_all_mutations_hidden_from_autograd() const { - return mutation_hidden_from_autograd_counter_ == mutation_counter_; + return functional_storage_impl()->are_all_mutations_hidden_from_autograd(); } // Did all mutations happen under no_grad or inference_mode // (We also need to ignore mutations fully hidden from autograd here) bool are_all_mutations_under_no_grad_or_inference_mode() const { - return mutation_hidden_from_autograd_counter_ + - mutation_during_no_grad_or_inference_mode_ == - mutation_counter_; + return functional_storage_impl() + ->are_all_mutations_under_no_grad_or_inference_mode(); } + // Runs the forward_fn of every ViewMeta collected in the current instance + // to some other base. + Tensor apply_view_metas(const Tensor& base); + // Sync's the underlying tensor with its alias, if it's out of date. 
This // involves two steps: 1) Apply any pending updates/mutations to the alias 2) // Replay the views (if any) to regenerate the current tensor off of the @@ -130,7 +136,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { // from the base tensor. This method is used by inplace-view ops like // transpose_. It appends a ViewMeta to the existing stack, and refreshes the // tensor by replaying the views off of the alias. - void mutate_view_meta(at::functionalization::ViewMeta meta); + void mutate_view_meta(const at::functionalization::ViewMeta& meta); // Custom implementation of self.set_(src) void set__impl(const FunctionalTensorWrapper* other); @@ -156,7 +162,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { // a.replace_(tmp) // // replace_() swaps out the wrapped tensor, value_, with tmp. - void replace_(const Tensor& other); + void replace_(const Tensor& other, bool from_lazy_regenerate = false); bool is_multi_output_view() { return is_multi_output_view_; @@ -211,18 +217,22 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { VariableVersion&& version_counter, bool allow_tensor_metadata_change) const; + void shallow_copy_from(const c10::intrusive_ptr& impl) override; + void copy_tensor_metadata_and_refresh( + const FunctionalTensorWrapper* src_impl, + FunctionalTensorWrapper* dest_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const; + // Note that value is not taken by reference: internally, the wrapper will // change the value tensor that it points to over time. Tensor value_; - int64_t level_; + int64_t level_{}; // These two counters are used for identifying // whether all the mutations on a given tensor are hidden from autograd or // not. If we have an input mutation that is hidden from autograd, then once // we convert the input mutation to a copy_() we know it will be safe to hide // the copy_() from autograd as well. - uint64_t mutation_counter_ = 0; - uint64_t mutation_hidden_from_autograd_counter_ = 0; - uint64_t mutation_during_no_grad_or_inference_mode_ = 0; bool has_metadata_mutation_ = false; bool is_multi_output_view_ = false; // Did the tensor experience a set_() call. @@ -230,6 +240,13 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { size_t generation_ = 0; std::vector view_metas_; + + protected: + static void copy_tensor_metadata( + const FunctionalTensorWrapper* src_impl, + FunctionalTensorWrapper* dest_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change); }; // Utility functions for the functionalization pass. 
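// --- Illustrative sketch (not part of the upstream patch) ---------------------
// apply_view_metas(), declared above, factors the view-replay loop out of
// regenerate_from_base(): each ViewMeta carries a forward lambda, and replaying
// a view chain means folding those lambdas over a (possibly different) base.
// A minimal model of that idea; ViewStep and replay_views are illustrative
// names, not the real at::functionalization types:
#include <cstdint>
#include <functional>
#include <vector>

template <class TensorT>
struct ViewStep {
  std::function<TensorT(const TensorT&, int64_t)> forward;
  int64_t out_index = 0;  // which output of a multi-output view to pick
};

template <class TensorT>
TensorT replay_views(TensorT t, const std::vector<ViewStep<TensorT>>& steps) {
  for (const auto& step : steps) {
    t = step.forward(t, step.out_index);  // re-derive the view from the new base
  }
  return t;
}
// -------------------------------------------------------------------------------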
@@ -310,9 +327,11 @@ Tensor create_functional_tensor_with_view_meta( std::vector create_functional_tensor_with_view_meta( ITensorListRef view_to_wrap, const Tensor& base, - functionalization::ViewMeta meta); + const functionalization::ViewMeta& meta); -void mutate_view_meta(const Tensor& self, functionalization::ViewMeta meta); +void mutate_view_meta( + const Tensor& self, + const functionalization::ViewMeta& meta); void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); void set_sizes_strides_offset( diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index 783a925d69833..594f627e17ccf 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -30,17 +30,29 @@ namespace { void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet, torch::jit::Stack* stack) { const auto& schema = op.schema(); + // NB: auto_functionalize handles the case where outputs do not have alias info. + // This error message therefore suggests that users modify their custom op to the + // point where auto_functionalize works instead of asking them to try the raw + // functionalization API (because that is a bit difficult to use). + // If you're here and want to try the raw functionalization kernel approach, + // see https://gist.github.com/bdhirsh/7dadbf6296f8f7d1abcf4c482f438aaa TORCH_CHECK( !schema.hasAnyAliasInfo(), - "Found a custom (non-ATen) operator that either mutates or its inputs: ", - op.operator_name().name, ".", op.operator_name().overload_name, - ". Getting these operators to work with functionalization requires some extra work", - ". For mutable ops you need to register a corresponding out-of-place variant of the op,", - " and you also need to register a Functionalization kernel that performs some boilerplate,", - " telling functionalization to map from the mutable op to the out-of-place op", - ". See a more complete example of how to do this at ", - "https://gist.github.com/bdhirsh/7dadbf6296f8f7d1abcf4c482f438aaa.", - " Please file a GitHub issue if you run into any problems."); + "Found a custom (non-ATen) operator whose output has alias annotations: ", + op.schema(), + ". We only support functionalizing operators whose outputs do not have alias ", + "annotations (e.g. 'Tensor(a)' is a Tensor with an alias annotation whereas ", + "'Tensor' is a Tensor without. The '(a)' is the alias annotation). " + "The alias annotation specifies that the output ", + "Tensor shares storage with an input that has the same annotation. ", + "Please check if ", + "(1) the output needs to be an output (if not, don't return it), ", + "(2) if the output doesn't share storage with any inputs, then ", + "delete the alias annotation. ", + "(3) if the output indeed shares storage with an input, then add a ", + ".clone() before returning it to prevent storage sharing and then " +"delete the alias annotation. 
", + "Otherwise, please file an issue on GitHub."); const auto num_arguments = schema.arguments().size(); const auto arguments_begin = stack->size() - num_arguments; auto arguments = torch::jit::last(stack, num_arguments); @@ -168,7 +180,7 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size)); } ); - at::functionalization::impl::mutate_view_meta(self, std::move(view_meta)); + at::functionalization::impl::mutate_view_meta(self, view_meta); return self; } @@ -198,7 +210,13 @@ static at::Tensor lift_fresh_functionalize_copy(const at::Tensor & self) { // but that isn't really a use case today. // Needed for https://github.com/pytorch/pytorch/issues/105327 if (at::functionalization::impl::isFunctionalTensor(self)) { - return self.clone(); + // Note [Composite Functionalization under PreDispatch mode] + // When we are tracing under PreDispatch, PreDispatch key will be + // in the local include TLS. As a result, when we redispatch here, + // we will end up hitting PreDispatch stack first. So, we should + // directly redispatch to the functionalize key manually. + static auto op = c10::Dispatcher::singleton().findSchemaOrThrow("aten::clone", "").typed)>(); + return op.redispatch(c10::DispatchKeySet({c10::DispatchKey::Functionalize}), self, c10::nullopt); } at::AutoDispatchSkipFunctionalize guard; @@ -304,15 +322,15 @@ static at::Tensor& set__functionalize(at::Tensor& self, const at::Tensor& src) { TORCH_CHECK(at::functionalization::impl::isFunctionalTensor(self) || !at::functionalization::impl::isFunctionalTensor(src), "set__functionalize: Tried to mutate a non-functional tensor with a functional tensor, which is not allowed"); - TORCH_CHECK(at::functionalization::impl::isFunctionalTensor(src), - "set__functionalize: We do not currently support x.set_(y) where y is not a FunctionalTensor. Please file an issue"); - // nop case if (!at::functionalization::impl::isFunctionalTensor(self) && !at::functionalization::impl::isFunctionalTensor(src)) { at::AutoDispatchSkipFunctionalize guard; return self.set_(src); } + TORCH_CHECK(at::functionalization::impl::isFunctionalTensor(src), + "set__functionalize: We do not currently support x.set_(y) where y is not a FunctionalTensor. Please file an issue"); + TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(src)); auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h index 111c7eb8f5fc7..caa8ec42003c9 100644 --- a/aten/src/ATen/InferSize.h +++ b/aten/src/ATen/InferSize.h @@ -37,7 +37,8 @@ inline void infer_size_impl( } } - if (numel == newsize || (infer_dim && newsize > 0 && numel % newsize == 0)) { + if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(numel, newsize)) || + (infer_dim && newsize > 0 && numel % newsize == 0)) { if (infer_dim) { // We have a degree of freedom here to select the dimension size; follow // NumPy semantics and just bail. 
However, a nice error message is needed diff --git a/aten/src/ATen/LegacyBatchedTensorImpl.h b/aten/src/ATen/LegacyBatchedTensorImpl.h index 732e252165ca6..098fbf9d6292f 100644 --- a/aten/src/ATen/LegacyBatchedTensorImpl.h +++ b/aten/src/ATen/LegacyBatchedTensorImpl.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include diff --git a/aten/src/ATen/LegacyVmapTransforms.cpp b/aten/src/ATen/LegacyVmapTransforms.cpp index ca43993ed7d35..5560f9a0d7963 100644 --- a/aten/src/ATen/LegacyVmapTransforms.cpp +++ b/aten/src/ATen/LegacyVmapTransforms.cpp @@ -135,9 +135,7 @@ static Tensor alignBatchDimsAtFront( const Tensor& self, std::bitset requested_levels, int64_t requested_example_dim) { - Tensor physical_tensor; - std::bitset tensor_levels; - std::tie(physical_tensor, tensor_levels) = getPhysicalTensorAndLevels(self); + auto [physical_tensor, tensor_levels] = getPhysicalTensorAndLevels(self); TORCH_INTERNAL_ASSERT( (tensor_levels | requested_levels) == requested_levels, @@ -263,10 +261,7 @@ VmapPhysicalViewVec BroadcastingVmapTransform::logicalToPhysical(TensorList logi VmapPhysicalViewVec result; - std::bitset levels; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t largest_logical_dim; - std::tie(levels, largest_logical_dim) = getLevelsAndLargestLogicalDim(logical_tensors); + auto [levels, largest_logical_dim] = getLevelsAndLargestLogicalDim(logical_tensors); for (const auto& tensor : logical_tensors) { // NB: It's possible that we didn't actually need to align `tensor`. diff --git a/aten/src/ATen/LegacyVmapTransforms.h b/aten/src/ATen/LegacyVmapTransforms.h index b32b182056556..97729b3254e74 100644 --- a/aten/src/ATen/LegacyVmapTransforms.h +++ b/aten/src/ATen/LegacyVmapTransforms.h @@ -113,8 +113,8 @@ struct VmapPhysicalToLogicalMap; // levels: 012345 struct TORCH_API VmapPhysicalView { VmapPhysicalView(Tensor&& tensor, std::bitset levels) - : levels_(levels), tensor_(tensor) { - TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor)); + : levels_(levels), tensor_(std::move(tensor)) { + TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor_)); } Tensor& tensor() { diff --git a/aten/src/ATen/MapAllocator.cpp b/aten/src/ATen/MapAllocator.cpp index 497b53d35b048..19c08634d2cf9 100644 --- a/aten/src/ATen/MapAllocator.cpp +++ b/aten/src/ATen/MapAllocator.cpp @@ -63,7 +63,6 @@ constexpr const char* unknown_eventname = "eventname not specified"; MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, size_t size) : filename_(filename.empty() ? 
unknown_filename : filename) - , flags_(0) // to be filled later , size_(0) // to be filled later #ifdef _WIN32 , handle_(INVALID_HANDLE_VALUE) // to be filled later @@ -72,7 +71,6 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, #else , fd_(fd) #endif - , base_ptr_(nullptr) { if (!(flags & ALLOCATOR_MAPPED_SHARED) && !(flags & ALLOCATOR_MAPPED_SHAREDMEM)) { @@ -252,11 +250,13 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, if (!(flags_ & ALLOCATOR_MAPPED_FROMFD)) { if (flags_ & ALLOCATOR_MAPPED_SHARED) { + // NOLINTNEXTLINE(bugprone-assignment-in-if-condition) if ((fd = open(filename_.c_str(), flags, (mode_t)0600)) == -1) { TORCH_CHECK(false, "unable to open file <", filename_, "> in read-write mode: ", strerror(errno), " (", errno, ")"); } } else if (flags_ & ALLOCATOR_MAPPED_SHAREDMEM) { #ifdef HAVE_SHM_OPEN + // NOLINTNEXTLINE(bugprone-assignment-in-if-condition) if((fd = shm_open(filename_.c_str(), flags, (mode_t)0600)) == -1) { TORCH_CHECK(false, "unable to open shared memory object <", filename_, "> in read-write mode: ", strerror(errno), " (", errno, ")"); } @@ -264,6 +264,7 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, TORCH_CHECK(false, "unable to open file <", filename_, "> in sharedmem mode, shm_open unavailable on this platform"); #endif } else { + // NOLINTNEXTLINE(bugprone-assignment-in-if-condition) if ((fd = open(filename_.c_str(), O_RDONLY)) == -1) { TORCH_CHECK(false, "unable to open file <", filename_, "> in read-only mode: ", strerror(errno), " (", errno, ")"); } @@ -272,7 +273,7 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, fd = fd_; } - struct stat file_stat; + struct stat file_stat{}; if (fstat(fd, &file_stat) == -1) { int last_err = errno; if (!(flags_ & ALLOCATOR_MAPPED_FROMFD)) { @@ -284,7 +285,7 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, if (size > 0) { if (static_cast(size) > file_stat.st_size) { if (flags_) { - if (ftruncate(fd, size) == -1) { + if (ftruncate(fd, static_cast(size)) == -1) { TORCH_CHECK(false, "unable to resize file <", filename_, "> to the right size: ", strerror(errno), " (", errno, ")"); } if (fstat(fd, &file_stat) == -1 || file_stat.st_size < static_cast(size)) { @@ -311,7 +312,7 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, size = file_stat.st_size; } - size_ = size; /* if we are here, it must be the right size */ + size_ = static_cast(size); /* if we are here, it must be the right size */ /* map it */ if (flags_ & (ALLOCATOR_MAPPED_SHARED | ALLOCATOR_MAPPED_SHAREDMEM)) { @@ -325,6 +326,11 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, TORCH_CHECK(false, "unable to mmap ", size_, " bytes from file <", filename_, ">: ", strerror(errno), " (", errno, ")"); } +#if !defined(__APPLE__) && !defined(__ANDROID__) + /* attempt to use larger block size on Linux, which is important for getting better CUDA upload speed */ + posix_fadvise(fd, 0, static_cast(size), POSIX_FADV_SEQUENTIAL); +#endif + if (flags_ & ALLOCATOR_MAPPED_KEEPFD) { fd_ = fd; } else { @@ -601,8 +607,7 @@ void* RefcountedMapAllocator::data() const { } MapAllocator::~MapAllocator() { - // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall) - close(); + MapAllocator::close(); c10::reportMemoryUsageToProfiler(base_ptr_, -size_, 0, 0, c10::Device(c10::DeviceType::CPU)); } diff --git a/aten/src/ATen/MapAllocator.h 
b/aten/src/ATen/MapAllocator.h index 3354ab84577f1..f4a30edef6239 100644 --- a/aten/src/ATen/MapAllocator.h +++ b/aten/src/ATen/MapAllocator.h @@ -128,7 +128,7 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck, void close() override; ~RefcountedMapAllocator() override { - close(); + RefcountedMapAllocator::close(); } protected: diff --git a/aten/src/ATen/MatrixRef.h b/aten/src/ATen/MatrixRef.h index ba693ab7d5809..901efff4cc23f 100644 --- a/aten/src/ATen/MatrixRef.h +++ b/aten/src/ATen/MatrixRef.h @@ -94,16 +94,16 @@ class MatrixRef { /// The declaration here is extra complicated so that "arrayRef = {}" /// continues to select the move assignment operator. template - typename std::enable_if::value, MatrixRef>::type& - operator=(U&& Temporary) = delete; + std::enable_if_t, MatrixRef>& operator=( + U&& Temporary) = delete; /// Disallow accidental assignment from a temporary. /// /// The declaration here is extra complicated so that "arrayRef = {}" /// continues to select the move assignment operator. template - typename std::enable_if::value, MatrixRef>::type& - operator=(std::initializer_list) = delete; + std::enable_if_t, MatrixRef>& operator=( + std::initializer_list) = delete; }; } // end namespace at diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 7195d04f0f4cd..a76156c03402d 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -297,7 +297,7 @@ static int64_t num_batch_dims(DimnameList names) { if (names.size() <= 2) { return 0; } - return names.size() - 2; + return static_cast(names.size() - 2); } static std::vector compute_matmul_outnames( diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp index 223a95ebc132b..2f73b7b304ee3 100644 --- a/aten/src/ATen/NestedTensorImpl.cpp +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -10,6 +10,7 @@ #include #include +#include namespace { inline void validate_nested_tensor_metadata( @@ -67,8 +68,8 @@ c10::DispatchKeySet get_view_key_set(const at::Tensor& base) { } } // namespace -namespace at { -namespace native { + +namespace at::native { inline std::vector construct_opt_sizes(const at::Tensor& sizes) { // torch.tensor([]) is considered to have `dim() = 1` and `size(0) = 0` @@ -80,7 +81,7 @@ inline std::vector construct_opt_sizes(const at::Tensor& sizes) { std::vector result(1, sizes.sizes()[0]); if (sizes.dim() > 0) { size_t nested_dim = result.size(); - int64_t* sizes_ptr = sizes.data_ptr(); + const int64_t* sizes_ptr = sizes.const_data_ptr(); result.resize(nested_dim + sizes.sizes()[1]); int64_t sizes_size_0 = sizes.sizes()[0]; int64_t sizes_size_1 = sizes.sizes()[1]; @@ -100,7 +101,7 @@ inline std::vector construct_opt_sizes(const at::Tensor& sizes) { } // assume contiguous, we can construct stride from size -inline at::Tensor construct_nested_strides(const at::Tensor& sizes) { +at::Tensor construct_nested_strides(const at::Tensor& sizes) { // empty `sizes` means empty nested tensor, so return empty strides if (sizes.dim() == 0) { return sizes; @@ -113,7 +114,7 @@ inline at::Tensor construct_nested_strides(const at::Tensor& sizes) { return sizes; } at::Tensor strides = sizes.new_empty(sizes.sizes()); - const int64_t* sizes_ptr = sizes.data_ptr(); + const int64_t* sizes_ptr = sizes.const_data_ptr(); int64_t* strides_ptr = strides.data_ptr(); for (int64_t i = 0; i < sizes.size(0); i++) { strides_ptr[orig_dim - 1] = 1; @@ -138,7 +139,7 @@ inline at::Tensor construct_nested_strides(const at::Tensor& sizes) { 
* * @return A tensor of offsets */ -inline at::Tensor construct_offsets(const at::Tensor& sizes) { +at::Tensor construct_offsets(const at::Tensor& sizes) { // empty `sizes` means empty nested tensor, so return empty strides if (sizes.dim() == 0) { return at::empty({0}, sizes.options().dtype(kLong)); @@ -151,10 +152,10 @@ inline at::Tensor construct_offsets(const at::Tensor& sizes) { std::iota(offsets_ptr, offsets_ptr + ntensors, 0); return offsets; } - const int64_t* sizes_ptr = sizes.data_ptr(); + const int64_t* sizes_ptr = sizes.const_data_ptr(); offsets_ptr[0] = 0; for (const auto i : c10::irange(ntensors - 1)) { - const int64_t row_product = std::accumulate(sizes_ptr, sizes_ptr + orig_dim, 1, std::multiplies()); + const int64_t row_product = std::accumulate(sizes_ptr, sizes_ptr + orig_dim, 1, std::multiplies()); offsets_ptr[i + 1] = offsets_ptr[i] + row_product; sizes_ptr += orig_dim; } @@ -188,7 +189,7 @@ NestedTensorImpl::NestedTensorImpl( } NestedTensorImpl::NestedTensorImpl( - at::Tensor buffer, + const at::Tensor& buffer, at::Tensor nested_sizes, at::Tensor nested_strides, at::Tensor storage_offsets) @@ -196,9 +197,9 @@ NestedTensorImpl::NestedTensorImpl( buffer.storage(), generate_nested_key_set_from_buffer(buffer), buffer.dtype(), - nested_sizes, - nested_strides, - storage_offsets) { + std::move(nested_sizes), + std::move(nested_strides), + std::move(storage_offsets)) { TORCH_INTERNAL_ASSERT( buffer.dim() == 1, @@ -210,8 +211,8 @@ NestedTensorImpl::NestedTensorImpl( // assume contiguous, `nested_strides` and `offsets` // can be infered from `nested_sizes` NestedTensorImpl::NestedTensorImpl( - at::Tensor buffer, - at::Tensor nested_sizes) + const at::Tensor& buffer, + const at::Tensor& nested_sizes) : NestedTensorImpl( buffer, nested_sizes, @@ -343,7 +344,7 @@ int64_t get_numel_from_nested_size_tensor(const at::Tensor& tensor) { static_cast(std::numeric_limits::max()), static_cast(std::numeric_limits::max())); - const int64_t* sizes_ptr = tensor.data_ptr(); + const int64_t* sizes_ptr = tensor.const_data_ptr(); const auto nt_dim = tensor.size(1); uint64_t num_elements{0}; @@ -359,5 +360,4 @@ int64_t get_numel_from_nested_size_tensor(const at::Tensor& tensor) { return static_cast(num_elements); } -} // namespace native -} // namespace at +} // namespace at::native diff --git a/aten/src/ATen/NestedTensorImpl.h b/aten/src/ATen/NestedTensorImpl.h index 11d7e2f165548..0bd3d98e73c5c 100644 --- a/aten/src/ATen/NestedTensorImpl.h +++ b/aten/src/ATen/NestedTensorImpl.h @@ -14,6 +14,8 @@ namespace at::native { struct NestedTensorImpl; inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt); int64_t get_numel_from_nested_size_tensor(const at::Tensor& tensor); +at::Tensor construct_nested_strides(const at::Tensor& nested_size); +at::Tensor construct_offsets(const at::Tensor& nested_size); struct TORCH_API NestedTensorImpl : public c10::TensorImpl { explicit NestedTensorImpl( @@ -25,13 +27,15 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { at::Tensor storage_offsets); explicit NestedTensorImpl( - at::Tensor buffer, + const at::Tensor& buffer, at::Tensor nested_sizes, at::Tensor nested_strides, at::Tensor storage_offsets); // assume contiguous, `nested_strides` and `offsets` // can be infered from `nested_sizes` - explicit NestedTensorImpl(at::Tensor buffer, at::Tensor nested_sizes); + explicit NestedTensorImpl( + const at::Tensor& buffer, + const at::Tensor& nested_sizes); // This constructor is used creating view tensors from nested tensors explicit 
NestedTensorImpl( @@ -224,7 +228,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) { } const Tensor &sizemat = nt->get_nested_sizes(), &stridemat = nt->get_nested_strides(); - int64_t* offsets_ptr = nt->get_storage_offsets().data_ptr(); + const int64_t* offsets_ptr = + nt->get_storage_offsets().const_data_ptr(); int64_t orig_dim = sizemat.size(1); // nesting scalars if (orig_dim == 0) { @@ -239,8 +244,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) { // nesting tensors else { // if any underlying tensor is non-contiguous - const int64_t *sizemat_ptr = sizemat.data_ptr(), - *stridemat_ptr = stridemat.data_ptr(); + const int64_t *sizemat_ptr = sizemat.const_data_ptr(), + *stridemat_ptr = stridemat.const_data_ptr(); for (int64_t i = 0; i < ntensors; i++) { if (stridemat_ptr[orig_dim - 1] != 1) { return false; @@ -259,8 +264,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) { if (offsets_ptr[0] != 0) { return false; } - sizemat_ptr = sizemat.data_ptr(); - stridemat_ptr = stridemat.data_ptr(); + sizemat_ptr = sizemat.const_data_ptr(); + stridemat_ptr = stridemat.const_data_ptr(); for (int64_t i = 1; i < ntensors; i++) { if (offsets_ptr[i] != offsets_ptr[i - 1] + *sizemat_ptr * *stridemat_ptr) { diff --git a/aten/src/ATen/NumericUtils.h b/aten/src/ATen/NumericUtils.h index 06b25334bb13e..788da64b4e427 100644 --- a/aten/src/ATen/NumericUtils.h +++ b/aten/src/ATen/NumericUtils.h @@ -7,7 +7,9 @@ #include #include #include +#include #include +#include #include #include @@ -20,16 +22,12 @@ namespace at { // (uselessly) convert to floating point and then do the test. // This function is. -template < - typename T, - typename std::enable_if::value, int>::type = 0> +template , int> = 0> inline C10_HOST_DEVICE bool _isnan(T /*val*/) { return false; } -template < - typename T, - typename std::enable_if::value, int>::type = 0> +template , int> = 0> inline C10_HOST_DEVICE bool _isnan(T val) { #if defined(__CUDACC__) || defined(__HIPCC__) return ::isnan(val); @@ -38,24 +36,19 @@ inline C10_HOST_DEVICE bool _isnan(T val) { #endif } -template < - typename T, - typename std::enable_if::value, int>::type = 0> +template ::value, int> = 0> inline C10_HOST_DEVICE bool _isnan(T val) { return std::isnan(val.real()) || std::isnan(val.imag()); } -template < - typename T, - typename std::enable_if::value, int>::type = 0> +template , int> = 0> inline C10_HOST_DEVICE bool _isnan(T val) { return at::_isnan(static_cast(val)); } template < typename T, - typename std::enable_if::value, int>::type = - 0> + std::enable_if_t, int> = 0> inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) { return at::_isnan(static_cast(val)); } @@ -66,16 +59,28 @@ inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) { template < typename T, - typename std::enable_if::value, int>:: - type = 0> + std::enable_if_t, int> = 0> +inline C10_HOST_DEVICE bool _isnan(T val) { + return val.isnan(); +} + +template < + typename T, + std::enable_if_t, int> = 0> +inline C10_HOST_DEVICE bool _isnan(T val) { + return val.isnan(); +} + +template < + typename T, + std::enable_if_t, int> = 0> inline C10_HOST_DEVICE bool _isnan(T val) { return val.isnan(); } template < typename T, - typename std::enable_if::value, int>:: - type = 0> + std::enable_if_t, int> = 0> inline C10_HOST_DEVICE bool _isnan(T val) { return val.isnan(); } @@ -84,16 +89,12 @@ inline C10_HOST_DEVICE bool _isnan(T val) { // (uselessly) convert to floating point and then do the test. // This function is. 
-template < - typename T, - typename std::enable_if::value, int>::type = 0> +template , int> = 0> inline C10_HOST_DEVICE bool _isinf(T /*val*/) { return false; } -template < - typename T, - typename std::enable_if::value, int>::type = 0> +template , int> = 0> inline C10_HOST_DEVICE bool _isinf(T val) { #if defined(__CUDACC__) || defined(__HIPCC__) return ::isinf(val); @@ -118,10 +119,18 @@ inline C10_HOST_DEVICE bool _isinf(at::Float8_e4m3fn val) { return false; } +inline C10_HOST_DEVICE bool _isinf(at::Float8_e5m2fnuz val) { + return false; +} + +inline C10_HOST_DEVICE bool _isinf(at::Float8_e4m3fnuz val) { + return false; +} + template C10_HOST_DEVICE inline T exp(T x) { static_assert( - !std::is_same::value, + !std::is_same_v, "this template must be used with float or less precise type"); #if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) // use __expf fast approximation for peak bandwidth @@ -139,7 +148,7 @@ C10_HOST_DEVICE inline double exp(double x) { template C10_HOST_DEVICE inline T log(T x) { static_assert( - !std::is_same::value, + !std::is_same_v, "this template must be used with float or less precise type"); #if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) // use __logf fast approximation for peak bandwidth @@ -157,7 +166,7 @@ C10_HOST_DEVICE inline double log(double x) { template C10_HOST_DEVICE inline T log1p(T x) { static_assert( - !std::is_same::value, + !std::is_same_v, "this template must be used with float or less precise type"); #if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) // use __logf fast approximation for peak bandwidth @@ -176,7 +185,7 @@ C10_HOST_DEVICE inline double log1p(double x) { template C10_HOST_DEVICE inline T tan(T x) { static_assert( - !std::is_same::value, + !std::is_same_v, "this template must be used with float or less precise type"); #if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) // use __tanf fast approximation for peak bandwidth diff --git a/aten/src/ATen/OpMathType.h b/aten/src/ATen/OpMathType.h index ddb2ce71be05f..d00195b07e490 100644 --- a/aten/src/ATen/OpMathType.h +++ b/aten/src/ATen/OpMathType.h @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include namespace at { @@ -31,6 +33,14 @@ struct OpMathType { using type = float; }; template <> +struct OpMathType { + using type = float; +}; +template <> +struct OpMathType { + using type = float; +}; +template <> struct OpMathType> { using type = c10::complex; }; diff --git a/aten/src/ATen/OpaqueTensorImpl.h b/aten/src/ATen/OpaqueTensorImpl.h index e6c6413815bbd..f71ae5358f299 100644 --- a/aten/src/ATen/OpaqueTensorImpl.h +++ b/aten/src/ATen/OpaqueTensorImpl.h @@ -33,6 +33,7 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl { set_custom_sizes_strides(SizesStridesPolicy::CustomStrides); sizes_and_strides_.set_sizes(sizes); refresh_numel(); + // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer) is_non_overlapping_and_dense_ = is_non_overlapping_and_dense; } diff --git a/aten/src/ATen/Parallel-inl.h b/aten/src/ATen/Parallel-inl.h index 62f287fc33c42..a5e682281abe5 100644 --- a/aten/src/ATen/Parallel-inl.h +++ b/aten/src/ATen/Parallel-inl.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace at { @@ -24,13 +25,19 @@ inline void parallel_for( at::get_num_threads() > 1); if (!use_parallel) { internal::ThreadIdGuard tid_guard(0); + c10::ParallelGuard guard(true); f(begin, end); return; } - internal::invoke_parallel(begin, end, grain_size, f); + internal::invoke_parallel( + begin, end, grain_size, [&](int64_t begin, int64_t end) { + 
c10::ParallelGuard guard(true); + f(begin, end); + }); #else internal::ThreadIdGuard tid_guard(0); + c10::ParallelGuard guard(true); f(begin, end); #endif } @@ -56,6 +63,7 @@ inline scalar_t parallel_reduce( max_threads > 1); if (!use_parallel) { internal::ThreadIdGuard tid_guard(0); + c10::ParallelGuard guard(true); return f(begin, end, ident); } @@ -66,6 +74,7 @@ inline scalar_t parallel_reduce( grain_size, [&](const int64_t my_begin, const int64_t my_end) { const auto tid = at::get_thread_num(); + c10::ParallelGuard guard(true); results[tid] = f(my_begin, my_end, ident); }); @@ -76,6 +85,7 @@ inline scalar_t parallel_reduce( return result; #else internal::ThreadIdGuard tid_guard(0); + c10::ParallelGuard guard(true); return f(begin, end, ident); #endif } diff --git a/aten/src/ATen/ParallelCommon.cpp b/aten/src/ATen/ParallelCommon.cpp index 285713417cb4c..0504a066eef50 100644 --- a/aten/src/ATen/ParallelCommon.cpp +++ b/aten/src/ATen/ParallelCommon.cpp @@ -15,6 +15,10 @@ #include #endif +#if defined(__APPLE__) && defined(__aarch64__) && !defined(C10_MOBILE) +#include +#endif + namespace at { namespace { @@ -46,30 +50,30 @@ std::string get_parallel_info() { std::ostringstream ss; ss << "ATen/Parallel:\n\tat::get_num_threads() : " - << at::get_num_threads() << std::endl; + << at::get_num_threads() << '\n'; ss << "\tat::get_num_interop_threads() : " - << at::get_num_interop_threads() << std::endl; + << at::get_num_interop_threads() << '\n'; - ss << at::get_openmp_version() << std::endl; + ss << at::get_openmp_version() << '\n'; #ifdef _OPENMP - ss << "\tomp_get_max_threads() : " << omp_get_max_threads() << std::endl; + ss << "\tomp_get_max_threads() : " << omp_get_max_threads() << '\n'; #endif - ss << at::get_mkl_version() << std::endl; + ss << at::get_mkl_version() << '\n'; #if AT_MKL_ENABLED() - ss << "\tmkl_get_max_threads() : " << mkl_get_max_threads() << std::endl; + ss << "\tmkl_get_max_threads() : " << mkl_get_max_threads() << '\n'; #endif - ss << at::get_mkldnn_version() << std::endl; + ss << at::get_mkldnn_version() << '\n'; ss << "std::thread::hardware_concurrency() : " - << std::thread::hardware_concurrency() << std::endl; + << std::thread::hardware_concurrency() << '\n'; - ss << "Environment variables:" << std::endl; + ss << "Environment variables:" << '\n'; ss << "\tOMP_NUM_THREADS : " - << get_env_var("OMP_NUM_THREADS", "[not set]") << std::endl; + << get_env_var("OMP_NUM_THREADS", "[not set]") << '\n'; ss << "\tMKL_NUM_THREADS : " - << get_env_var("MKL_NUM_THREADS", "[not set]") << std::endl; + << get_env_var("MKL_NUM_THREADS", "[not set]") << '\n'; ss << "ATen parallel backend: "; #if AT_PARALLEL_OPENMP @@ -82,7 +86,7 @@ std::string get_parallel_info() { #ifdef C10_MOBILE ss << " [mobile]"; #endif - ss << std::endl; + ss << '\n'; #if AT_EXPERIMENTAL_SINGLE_THREAD_POOL ss << "Experimental: single thread pool" << std::endl; @@ -104,11 +108,23 @@ int intraop_default_num_threads() { #if defined(FBCODE_CAFFE2) && defined(__aarch64__) nthreads = 1; #else +#if defined(__aarch64__) && defined(__APPLE__) + // On Apple Silicon there are efficiency and performance cores + // Restrict parallel algorithms to performance cores by default + int32_t num_cores = -1; + size_t num_cores_len = sizeof(num_cores); + if (sysctlbyname("hw.perflevel0.physicalcpu", &num_cores, &num_cores_len, nullptr, 0) == 0) { + if (num_cores > 1) { + nthreads = num_cores; + return num_cores; + } + } +#endif nthreads = TaskThreadPoolBase::defaultNumThreads(); #endif } - return nthreads; -#endif + return
static_cast(nthreads); +#endif /* !defined(C10_MOBILE) */ } } // namespace at diff --git a/aten/src/ATen/ParallelNative.cpp b/aten/src/ATen/ParallelNative.cpp index 948d1e5921c7a..a2e1992650009 100644 --- a/aten/src/ATen/ParallelNative.cpp +++ b/aten/src/ATen/ParallelNative.cpp @@ -152,7 +152,7 @@ void invoke_parallel( std::atomic_flag err_flag = ATOMIC_FLAG_INIT; std::exception_ptr eptr; std::mutex mutex; - volatile size_t remaining{0}; + std::atomic_size_t remaining{0}; std::condition_variable cv; } state; diff --git a/aten/src/ATen/ParallelOpenMP.h b/aten/src/ATen/ParallelOpenMP.h index b983571f09a2e..84e744ba10b10 100644 --- a/aten/src/ATen/ParallelOpenMP.h +++ b/aten/src/ATen/ParallelOpenMP.h @@ -11,10 +11,8 @@ #include #endif -namespace at { - #ifdef _OPENMP -namespace internal { +namespace at::internal { template inline void invoke_parallel( int64_t begin, @@ -52,7 +50,5 @@ inline void invoke_parallel( std::rethrow_exception(eptr); } } -} // namespace internal +} // namespace at::internal #endif // _OPENMP - -} // namespace at diff --git a/aten/src/ATen/SparseCsrTensorImpl.cpp b/aten/src/ATen/SparseCsrTensorImpl.cpp index ec9ade9695ece..8dc1fd05452a7 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.cpp +++ b/aten/src/ATen/SparseCsrTensorImpl.cpp @@ -55,7 +55,11 @@ SparseCsrTensorImpl::SparseCsrTensorImpl( "to https://github.com/pytorch/pytorch/issues."); TORCH_INTERNAL_ASSERT(((key_set.has(DispatchKey::SparseCsrCPU) && device().type() == kCPU) - || (key_set.has(DispatchKey::SparseCsrCUDA) && device().type() == kCUDA)), + || (key_set.has(DispatchKey::SparseCsrCUDA) && device().type() == kCUDA) + || (key_set.has(DispatchKey::SparseCsrMeta) && device().type() == kMeta) + || (key_set.has(DispatchKey::SparseCsrCPU) && device().type() == kMeta) // fake tensor + || (key_set.has(DispatchKey::SparseCsrCUDA) && device().type() == kMeta) // fake tensor + || (key_set.has(DispatchKey::SparseCsrPrivateUse1) && device().type() == kPrivateUse1)), "Inconsistent key_set (=", key_set, ") and device (=", device(), ")"); set_storage_access_should_throw(); @@ -166,9 +170,7 @@ void SparseCsrTensorImpl::resize_as_sparse_compressed_tensor_( src.layout(), ")"); - Tensor compressed_indices; - Tensor plain_indices; - std::tie(compressed_indices, plain_indices) = + auto [compressed_indices, plain_indices] = sparse_csr::getCompressedPlainIndices(src); // reuse self indices storage if (crow_indices_.sizes() != compressed_indices.sizes()) { diff --git a/aten/src/ATen/SparseCsrTensorImpl.h b/aten/src/ATen/SparseCsrTensorImpl.h index c39aeb4c5d82b..94ac1e1c39344 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.h +++ b/aten/src/ATen/SparseCsrTensorImpl.h @@ -2,6 +2,7 @@ #include #include +#include #include namespace at { @@ -107,6 +108,39 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { } } + template + c10::intrusive_ptr shallow_copy_and_detach_core( + VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const { + const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len(); + c10::impl::PyInterpreter&& interpreter = nullptr; + if (mode_stack_len > 0 && + !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) { + const auto& cur_torch_dispatch_mode_state = + c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1); + interpreter = cur_torch_dispatch_mode_state->pyinterpreter(); + } else if ( + key_set_.has(DispatchKey::Python) && + !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) { + interpreter = pyobj_slot_.load_pyobj_interpreter(); + } else { 
+ // otherwise just copy the SparseTensorImpl and not the PyObject. + auto impl = c10::make_intrusive( + key_set(), device(), layout_impl(), dtype()); + copy_tensor_metadata( + /*src_sparse_impl=*/this, + /*dest_sparse_impl=*/impl.get(), + /*version_counter=*/version_counter, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + auto r = interpreter->detach(this); + r->set_version_counter(std::forward(version_counter)); + r->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + return r; + } + /** * Return a TensorImpl that is a shallow-copy of this TensorImpl. * @@ -116,15 +150,8 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { c10::intrusive_ptr shallow_copy_and_detach( const c10::VariableVersion& version_counter, bool allow_tensor_metadata_change) const override { - auto impl = c10::make_intrusive( - key_set(), device(), layout_impl(), dtype()); - copy_tensor_metadata( - /*src_sparse_impl=*/this, - /*dest_sparse_impl=*/impl.get(), - /*version_counter=*/version_counter, - /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); - impl->refresh_numel(); - return impl; + return shallow_copy_and_detach_core( + version_counter, allow_tensor_metadata_change); } /** @@ -136,15 +163,8 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { c10::intrusive_ptr shallow_copy_and_detach( c10::VariableVersion&& version_counter, bool allow_tensor_metadata_change) const override { - auto impl = c10::make_intrusive( - key_set(), device(), layout_impl(), dtype()); - copy_tensor_metadata( - /*src_sparse_impl=*/this, - /*dest_sparse_impl=*/impl.get(), - /*version_counter=*/version_counter, - /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); - impl->refresh_numel(); - return impl; + return shallow_copy_and_detach_core( + std::move(version_counter), allow_tensor_metadata_change); } private: @@ -168,12 +188,12 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { static void copy_tensor_metadata( const SparseCsrTensorImpl* src_sparse_impl, SparseCsrTensorImpl* dest_sparse_impl, - const c10::VariableVersion& version_counter, + c10::VariableVersion version_counter, bool allow_tensor_metadata_change) { TensorImpl::copy_tensor_metadata( src_sparse_impl, dest_sparse_impl, - version_counter, + std::move(version_counter), allow_tensor_metadata_change); // Sparse-specific fields diff --git a/aten/src/ATen/SparseCsrTensorUtils.h b/aten/src/ATen/SparseCsrTensorUtils.h index d3071c27b87da..348978293b8ac 100644 --- a/aten/src/ATen/SparseCsrTensorUtils.h +++ b/aten/src/ATen/SparseCsrTensorUtils.h @@ -137,8 +137,7 @@ AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND4( \ kComplexHalf, kHalf, kBool, kBFloat16, __VA_ARGS__)) -namespace at { -namespace sparse_csr { +namespace at::sparse_csr { using SparseCsrTensor = Tensor; @@ -244,22 +243,22 @@ inline std::string plainDimName(Layout layout) { } } -inline int rowDimension(Layout layout, IntArrayRef size) { +inline size_t rowDimension(Layout layout, IntArrayRef size) { return size.size() - (isCompressedRow(layout) ? 2 : 1); } -inline int columnDimension(Layout layout, IntArrayRef size) { +inline size_t columnDimension(Layout layout, IntArrayRef size) { return size.size() - (isCompressedColumn(layout) ? 2 : 1); } -inline int compressedDimension( +inline size_t compressedDimension( Layout layout, IntArrayRef size, size_t dense_ndim = 0) { return size.size() - dense_ndim - (isCompressedRow(layout) ? 
2 : 1); } -inline int plainDimension( +inline size_t plainDimension( Layout layout, IntArrayRef size, size_t dense_ndim = 0) { @@ -286,6 +285,21 @@ inline std::pair getCompressedPlainIndices(Tensor const& self) { }); } +inline ScalarType getIndexDtype(Tensor const& self) { + switch (self.layout()) { + case kSparseCsr: + case kSparseBsr: + return self.crow_indices().scalar_type(); + case kSparseCsc: + case kSparseBsc: + return self.ccol_indices().scalar_type(); + case kSparse: + return self._indices().scalar_type(); + default: + return ScalarType::Long; + } +} + inline Layout flip_compressed_layout(Layout layout) { switch (layout) { case kSparseCsr: @@ -335,8 +349,7 @@ inline bool only_sparse_compressed_binary_op_trivial_cases( return true; } if (self.is_same(other)) { - Tensor compressed_indices, plain_indices; - std::tie(compressed_indices, plain_indices) = + auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(self); static_cast(out.unsafeGetTensorImpl()) ->set_member_tensors( @@ -367,13 +380,12 @@ inline bool only_sparse_compressed_add_trivial_cases( }); } -inline Tensor to_type(Tensor input, ScalarType dtype) { - Tensor compressed_indices, plain_indices; - std::tie(compressed_indices, plain_indices) = +inline Tensor to_type(const Tensor& input, ScalarType dtype) { + auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(input); return at::_sparse_compressed_tensor_unsafe( - std::move(compressed_indices), - std::move(plain_indices), + compressed_indices, + plain_indices, std::move(input.values()).to(dtype), input.sizes(), dtype, @@ -388,7 +400,7 @@ inline std::tuple create_acc_buffer( ScalarType type, int64_t nnz = -1) { Tensor new_values, new_values_acc; - constexpr bool need_acc = !std::is_same::value; + constexpr bool need_acc = !std::is_same_v; bool is_integral = at::isIntegralType(type, /*includeBool=*/true); if constexpr (need_acc) { auto acc_dtype = CppTypeToScalarType::value; @@ -411,5 +423,4 @@ inline void copy_from_acc_buffer(Tensor& new_values, Tensor& new_values_acc) { } } -} // namespace sparse_csr -} // namespace at +} // namespace at::sparse_csr diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 36c93b706db86..0c0286f6c7a8c 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -35,7 +35,6 @@ SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::Typ SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::TypeMeta data_type, at::Tensor indices, at::Tensor values) : TensorImpl(key_set, data_type, values.device()) , sparse_dim_(1) - , dense_dim_(0) , indices_(std::move(indices)) , values_(std::move(values)) { // we proxy to this constructor so we can initialize the device correctly, but really only indices/values of this shape are allowed. 
diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index d90734100ca6c..af9cbd28b1c35 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -306,6 +307,38 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { const Tensor& indices, const Tensor& values); + template + c10::intrusive_ptr shallow_copy_and_detach_core( + VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const { + const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len(); + c10::impl::PyInterpreter&& interpreter = nullptr; + if (mode_stack_len > 0 && + !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) { + const auto& cur_torch_dispatch_mode_state = + c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1); + interpreter = cur_torch_dispatch_mode_state->pyinterpreter(); + } else if ( + key_set_.has(DispatchKey::Python) && + !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) { + interpreter = pyobj_slot_.load_pyobj_interpreter(); + } else { + // otherwise just copy the SparseTensorImpl and not the PyObject. + auto impl = c10::make_intrusive(key_set(), dtype()); + copy_tensor_metadata( + /*src_sparse_impl=*/this, + /*dest_sparse_impl=*/impl.get(), + /*version_counter=*/version_counter, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + auto r = interpreter->detach(this); + r->set_version_counter(std::forward(version_counter)); + r->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + return r; + } + /** * Return a TensorImpl that is a shallow-copy of this TensorImpl. * @@ -315,14 +348,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { c10::intrusive_ptr shallow_copy_and_detach( const c10::VariableVersion& version_counter, bool allow_tensor_metadata_change) const override { - auto impl = c10::make_intrusive(key_set(), dtype()); - copy_tensor_metadata( - /*src_impl=*/this, - /*dest_impl=*/impl.get(), - /*version_counter=*/version_counter, - /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); - impl->refresh_numel(); - return impl; + return shallow_copy_and_detach_core( + version_counter, allow_tensor_metadata_change); } /** @@ -334,14 +361,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { c10::intrusive_ptr shallow_copy_and_detach( c10::VariableVersion&& version_counter, bool allow_tensor_metadata_change) const override { - auto impl = c10::make_intrusive(key_set(), dtype()); - copy_tensor_metadata( - /*src_impl=*/this, - /*dest_impl=*/impl.get(), - /*version_counter=*/std::move(version_counter), - /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); - impl->refresh_numel(); - return impl; + return shallow_copy_and_detach_core( + std::move(version_counter), allow_tensor_metadata_change); } /** @@ -354,8 +375,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { AT_ASSERT(has_compatible_shallow_copy_type(impl->key_set())); auto sparse_impl = static_cast(impl.get()); copy_tensor_metadata( - /*src_impl=*/sparse_impl, - /*dest_impl=*/this, + /*src_sparse_impl=*/sparse_impl, + /*dest_sparse_impl=*/this, /*version_counter=*/version_counter(), /*allow_tensor_metadata_change=*/allow_tensor_metadata_change()); refresh_numel(); @@ -378,12 +399,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { static void copy_tensor_metadata( const SparseTensorImpl* src_sparse_impl, SparseTensorImpl* dest_sparse_impl, - const 
c10::VariableVersion& version_counter, + c10::VariableVersion version_counter, bool allow_tensor_metadata_change) { TensorImpl::copy_tensor_metadata( src_sparse_impl, dest_sparse_impl, - version_counter, + std::move(version_counter), allow_tensor_metadata_change); // Sparse-specific fields diff --git a/aten/src/ATen/StorageUtils.cpp b/aten/src/ATen/StorageUtils.cpp index df84464fc687e..19c240ed89048 100644 --- a/aten/src/ATen/StorageUtils.cpp +++ b/aten/src/ATen/StorageUtils.cpp @@ -25,10 +25,10 @@ C10_EXPORT void storage_copy( const c10::Storage& src, bool non_blocking) { auto dst_options = c10::TensorOptions().device(dst.device()).dtype(at::kByte); - auto dst_t = at::empty({0}, {}, dst_options).set_(dst); + auto dst_t = at::empty({0}, dst_options).set_(dst); auto src_options = c10::TensorOptions().device(src.device()).dtype(at::kByte); - auto src_t = at::empty({0}, {}, src_options).set_(src); + auto src_t = at::empty({0}, src_options).set_(src); dst_t.copy_(src_t, non_blocking); } diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index b64393e64a770..41f14a15ba99c 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -20,7 +20,7 @@ struct TORCH_API TensorGeometry { strides_(sizes.size()), has_symbolic_sizes_strides_( !c10::asIntArrayRefSlowOpt(sizes).has_value()) { - int64_t dim = sizes.size(); + int64_t dim = static_cast(sizes.size()); c10::SymInt expected_stride = 1; for (int64_t i = dim - 1; i >= 0; i--) { strides_[i] = expected_stride; @@ -41,7 +41,7 @@ struct TORCH_API TensorGeometry { bool is_contiguous() const; int64_t dim() const { - return sizes_.size(); + return static_cast(sizes_.size()); } int64_t size(int64_t dim) const { diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index b6e14addb4a7e..eb29b4d5ad739 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -24,8 +24,8 @@ namespace at::indexing { -const int64_t INDEX_MIN = c10::SymInt::min_representable_int(); -const int64_t INDEX_MAX = -(INDEX_MIN + 1); +constexpr int64_t INDEX_MIN = c10::SymInt::min_representable_int(); +constexpr int64_t INDEX_MAX = -(INDEX_MIN + 1); enum class TensorIndexType { None, Ellipsis, SymInt, Boolean, Slice, Tensor }; @@ -130,9 +130,7 @@ struct TORCH_API TensorIndex final { TensorIndex(int integer) : TensorIndex(SymInt(integer)) {} // Case 4: Boolean value - template < - class T, - class = typename std::enable_if::value>::type> + template >> TensorIndex(T boolean) : boolean_(boolean), type_(TensorIndexType::Boolean) {} // Case 5: Slice represented in `at::indexing::Slice` form @@ -219,7 +217,8 @@ static inline Tensor applySlice( SymInt length = (self_device == at::kCPU || self_device == at::kCUDA) ? (*self_sizes)[dim] : self.sym_size(dim); - if (!disable_slice_optimization && start == 0 && length == stop && + if (!disable_slice_optimization && + TORCH_GUARD_SIZE_OBLIVIOUS(start.sym_eq(0)) && length == stop && step == 1) { return self; } @@ -273,9 +272,9 @@ static inline Tensor boolToIndexingTensorCPUOrCUDA( // booleans add a dimension of size 1. true indexes this dimension as if 0:, // false as empty. 
if (value) { - return at::empty({1}, {}, self.options().dtype(kLong)).fill_(0.); + return at::empty({1}, self.options().dtype(kLong)).fill_(0.); } else { - return at::empty({0}, {}, self.options().dtype(kLong)); + return at::empty({0}, self.options().dtype(kLong)); } } @@ -285,9 +284,9 @@ static inline Tensor boolToIndexingTensorNonNativeDeviceType( // booleans add a dimension of size 1. true indexes this dimension as if 0:, // false as empty. if (value) { - return at::zeros({1}, {}, self.options().dtype(kLong)); + return at::zeros({1}, self.options().dtype(kLong)); } else { - return at::empty({0}, {}, self.options().dtype(kLong)); + return at::empty({0}, self.options().dtype(kLong)); } } @@ -318,12 +317,12 @@ static inline void recordTensorIndex( (*dim_ptr)++; }; -static inline c10::List> typeConvertIndices( +static inline c10::List<::std::optional> typeConvertIndices( const Tensor& /*self*/, std::vector&& indices) { - c10::List> converted_inds; + c10::List<::std::optional> converted_inds; converted_inds.reserve(indices.size()); - for (const auto& i : indices) { + for (auto&& i : std::move(indices)) { converted_inds.push_back(std::move(i)); } return converted_inds; @@ -539,9 +538,9 @@ static inline Tensor applySlicing( /*prev_dim_result=*/result, /*original_tensor=*/self, /*index=*/obj, - /*dim=*/&dim, + /*dim_ptr=*/&dim, /*specified_dims_ptr=*/&specified_dims, - /*real_dim=*/i, + /*real_dim=*/static_cast(i), /*outIndices=*/outIndices, /*disable_slice_optimization=*/disable_slice_optimization, /*original_tensor_device=*/self_device, diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 99c8eda122cfc..0afac10d44fbf 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -52,7 +52,7 @@ inline void get_strides(int64_t* strides, ArrayRef operands, int64_ } // Always at least 2d strides to support 2d for_each loops if (ndim < 2) { - const int64_t ntensors = operands.size(); + auto ntensors = operands.size(); std::fill_n(strides, (2 - ndim) * ntensors, 0); } } @@ -92,7 +92,7 @@ void OperandInfo::tensor(c10::MaybeOwned &&tensor) { void OperandInfo::exchange_tensor(c10::MaybeOwned &&new_tensor) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!original_tensor_base_->defined()); - original_tensor_base_ = std::exchange(tensor_base_, new_tensor); + original_tensor_base_ = std::exchange(tensor_base_, std::move(new_tensor)); *original_tensor_storage_ = std::exchange(*tensor_storage_, make_otr(*tensor_base_)); } @@ -119,6 +119,13 @@ TensorIteratorConfig& TensorIteratorConfig::add_owned_input(const TensorBase& in return *this; } +TensorIteratorConfig& TensorIteratorConfig::add_owned_const_input(const TensorBase& input) { + const_tensor_indices_.push_back(tensors_.size()); + tensors_.push_back(c10::MaybeOwned::owned(std::in_place, input)); + num_inputs_++; + return *this; +} + TensorIteratorConfig& TensorIteratorConfig::add_borrowed_output(const TensorBase& output) { TORCH_INTERNAL_ASSERT( num_inputs_ == 0, @@ -135,6 +142,13 @@ TensorIteratorConfig& TensorIteratorConfig::add_borrowed_input(const TensorBase& return *this; } +TensorIteratorConfig& TensorIteratorConfig::add_borrowed_const_input(const TensorBase& input) { + const_tensor_indices_.push_back(tensors_.size()); + tensors_.push_back(c10::MaybeOwned::borrowed(input)); + num_inputs_++; + return *this; +} + TensorIteratorConfig& TensorIteratorConfig::declare_static_dtype_and_device(ScalarType dtype, Device device) { TORCH_CHECK(!check_all_same_dtype_, "check_all_same_dtype(false) must be called 
before declare_static_dtype(...)"); static_dtype_ = dtype; @@ -173,6 +187,10 @@ TensorIteratorConfig& TensorIteratorConfig::declare_static_shape(IntArrayRef sha return *this; } +bool TensorIteratorConfig::is_tensor_const(size_t idx) { + return std::find(const_tensor_indices_.begin(), const_tensor_indices_.end(), idx) != const_tensor_indices_.end(); +} + // NOTE: [Computing output strides] // We use the following algorithm to compute output strides // If correctly sized output is provided, we respect its strides and don't change them @@ -531,7 +549,7 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) { } } -StrideVector TensorIteratorBase::compatible_stride(int element_size) const { +StrideVector TensorIteratorBase::compatible_stride(int64_t element_size) const { auto stride = StrideVector(); int64_t next_stride = element_size; for (const auto dim : c10::irange(ndim())) { @@ -558,8 +576,8 @@ void TensorIteratorBase::allocate_or_resize_outputs() { auto& op = operands_[i]; if (!op.tensor_base().defined() || op.will_resize) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); - int element_size = elementSize(op.target_dtype); - op.stride_bytes = compatible_stride(element_size); + auto element_size = elementSize(op.target_dtype); + op.stride_bytes = compatible_stride(static_cast(element_size)); // check if permutation is just an inverted order bool inverted = true; for (const auto j : c10::irange(ndim())) { @@ -577,7 +595,7 @@ void TensorIteratorBase::allocate_or_resize_outputs() { } else { auto tensor_stride = invert_perm(op.stride_bytes); for (const auto dim : c10::irange(ndim())) { - tensor_stride[dim] /= element_size; + tensor_stride[dim] /= static_cast(element_size); } set_output_raw_strided(i, tensor_shape, tensor_stride, original_options(op), names_); } @@ -757,7 +775,7 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) { StrideVector TensorIteratorBase::get_strides() const { const auto dim = ndim(); - StrideVector strides(std::max(dim, 2) * ntensors()); + StrideVector strides(static_cast(std::max(dim, 2)) * ntensors()); at::get_strides(strides.data(), operands_, dim); return strides; } @@ -771,7 +789,7 @@ void TensorIteratorBase::serial_for_each(loop2d_t loop, Range range) const { const auto ndim = this->ndim(); c10::SmallBuffer ptrs(ntensors); - c10::SmallBuffer strides(ntensors * std::max(ndim, 2)); + c10::SmallBuffer strides(ntensors * static_cast(std::max(ndim, 2))); at::get_base_ptrs(ptrs.data(), operands_); at::get_strides(strides.data(), operands_, ndim); @@ -795,7 +813,7 @@ bool TensorIteratorBase::is_contiguous() const { } -bool TensorIteratorBase::is_scalar(int arg) const { +bool TensorIteratorBase::is_scalar(int64_t arg) const { const auto& stride = operands_[arg].stride_bytes; for (const auto i : c10::irange(ndim())) { if (stride[i] != 0 && shape_[i] != 1) { @@ -805,7 +823,7 @@ bool TensorIteratorBase::is_scalar(int arg) const { return true; } -bool TensorIteratorBase::is_cpu_scalar(int arg) const { +bool TensorIteratorBase::is_cpu_scalar(int64_t arg) const { return is_scalar(arg) && device(arg).is_cpu(); } @@ -817,7 +835,7 @@ void TensorIteratorBase::cast_outputs() { // and tensor, this condition should no longer ever be true const auto &original_tensor = op.original_tensor(); const auto &tensor = op.tensor(); - if (original_tensor.sizes() != tensor.sizes()){ + if (original_tensor.sizes() != tensor.sizes()) { original_tensor.resize_as_(tensor).as_strided_(tensor.sizes(), tensor.strides()); } 
original_tensor.copy_(tensor); @@ -826,15 +844,15 @@ void TensorIteratorBase::cast_outputs() { } } -void* TensorIteratorBase::data_ptr(int arg) const { +void* TensorIteratorBase::data_ptr(int64_t arg) const { return operands_[arg].data; } -void TensorIteratorBase::remove_operand(int arg) { +void TensorIteratorBase::remove_operand(int64_t arg) { operands_.erase(operands_.begin() + arg); } -void TensorIteratorBase::unsafe_replace_operand(int arg, void* data) { +void TensorIteratorBase::unsafe_replace_operand(int64_t arg, void* data) { operands_[arg].data = data; } @@ -874,16 +892,16 @@ void TensorIteratorBase::build_binary_float_op( const TensorBase& out, const TensorBase& a, const TensorBase& b) { build(BINARY_FLOAT_OP_CONFIG() .add_owned_output(out) - .add_owned_input(a) - .add_owned_input(b)); + .add_owned_const_input(a) + .add_owned_const_input(b)); } void TensorIteratorBase::build_borrowing_binary_float_op( const TensorBase& out, const TensorBase& a, const TensorBase& b) { build(BINARY_FLOAT_OP_CONFIG() .add_output(out) - .add_input(a) - .add_input(b)); + .add_const_input(a) + .add_const_input(b)); } static void set_up_comparison_op_config(TensorIteratorConfig& config, const TensorBase& out) { @@ -916,8 +934,8 @@ void TensorIteratorBase::build_comparison_op( set_up_comparison_op_config(config, out); config.add_owned_output(out); - config.add_owned_input(a); - config.add_owned_input(b); + config.add_owned_const_input(a); + config.add_owned_const_input(b); build(config); } @@ -927,8 +945,8 @@ void TensorIteratorBase::build_borrowing_comparison_op( set_up_comparison_op_config(config, out); config.add_borrowed_output(out); - config.add_borrowed_input(a); - config.add_borrowed_input(b); + config.add_borrowed_const_input(a); + config.add_borrowed_const_input(b); build(config); } @@ -938,8 +956,8 @@ void TensorIteratorBase::build_borrowing_except_last_argument_comparison_op( set_up_comparison_op_config(config, out); config.add_borrowed_output(out); - config.add_borrowed_input(a); - config.add_owned_input(b); + config.add_borrowed_const_input(a); + config.add_owned_const_input(b); build(config); } @@ -951,9 +969,9 @@ void TensorIteratorBase::build_ternary_op( .cast_common_dtype_to_outputs(true) .enforce_safe_casting_to_output(true) .add_owned_output(out) - .add_owned_input(a) - .add_owned_input(b) - .add_owned_input(c)); + .add_owned_const_input(a) + .add_owned_const_input(b) + .add_owned_const_input(c)); } // This cannot be a function because TensorIteratorConfig is not @@ -969,16 +987,16 @@ void TensorIteratorBase::build_ternary_op( void TensorIteratorBase::build_binary_op(const TensorBase& out, const TensorBase& a, const TensorBase& b) { build(BINARY_OP_CONFIG() .add_owned_output(out) - .add_owned_input(a) - .add_owned_input(b)); + .add_owned_const_input(a) + .add_owned_const_input(b)); } void TensorIteratorBase::build_borrowing_binary_op( const TensorBase& out, const TensorBase& a, const TensorBase& b) { build(BINARY_OP_CONFIG() .add_output(out) - .add_input(a) - .add_input(b)); + .add_const_input(a) + .add_const_input(b)); } // This cannot be a function because TensorIteratorConfig is not @@ -994,13 +1012,13 @@ void TensorIteratorBase::build_borrowing_binary_op( void TensorIteratorBase::build_unary_float_op(const TensorBase& out, const TensorBase& a) { build(UNARY_FLOAT_OP_CONFIG() .add_owned_output(out) - .add_owned_input(a)); + .add_owned_const_input(a)); } void TensorIteratorBase::build_borrowing_unary_float_op(const TensorBase& out, const TensorBase& a) { build(UNARY_FLOAT_OP_CONFIG() 
.add_output(out) - .add_input(a)); + .add_const_input(a)); } // This cannot be a function because TensorIteratorConfig is not @@ -1015,19 +1033,19 @@ void TensorIteratorBase::build_borrowing_unary_float_op(const TensorBase& out, c void TensorIteratorBase::build_unary_op(const TensorBase& out, const TensorBase& a) { build(UNARY_OP_CONFIG() .add_owned_output(out) - .add_owned_input(a)); + .add_owned_const_input(a)); } void TensorIteratorBase::build_borrowing_unary_op(const TensorBase& out, const TensorBase& a) { build(UNARY_OP_CONFIG() .add_output(out) - .add_input(a)); + .add_const_input(a)); } void TensorIteratorBase::build_output_borrowing_argument_owning_unary_op(const TensorBase& out, const TensorBase& a) { build(UNARY_OP_CONFIG() .add_output(out) - .add_owned_input(a)); + .add_owned_const_input(a)); } // Helper to construct a unary op that forcibly promotes output to boolean. @@ -1039,7 +1057,7 @@ void TensorIteratorBase::build_borrowing_unary_force_boolean_op(const TensorBase .declare_static_dtype(at::kBool) .declare_static_device(a.device()) .add_output(out) - .add_input(a)); + .add_const_input(a)); } TensorIterator TensorIterator::binary_op(TensorBase& out, const TensorBase& a, const TensorBase& b) { @@ -1104,7 +1122,7 @@ TensorIterator TensorIterator::reduce_op(TensorBase& out, const TensorBase& a) { return TensorIteratorConfig() .set_check_mem_overlap(false) .add_owned_output(out) - .add_owned_input(a) + .add_owned_const_input(a) .resize_outputs(false) .is_reduction(true) // TODO: not supporting casting to outputs is only really necessary for arg{min,max} @@ -1128,7 +1146,7 @@ TensorIterator TensorIterator::reduce_op(TensorBase& out1, TensorBase& out2, con .set_check_mem_overlap(false) .add_owned_output(out1) .add_owned_output(out2) - .add_owned_input(a) + .add_owned_const_input(a) .resize_outputs(false) .is_reduction(true) .check_all_same_dtype(false) @@ -1136,7 +1154,8 @@ TensorIterator TensorIterator::reduce_op(TensorBase& out1, TensorBase& out2, con } void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) { - for (auto& tensor: config.tensors_) { + for (const auto idx : c10::irange(config.tensors_.size())) { + auto& tensor = config.tensors_[idx]; // If *any* of the arguments is a meta tensor, the overall // computation is a meta computation (don't do any work, // just compute output information). This aligns with @@ -1145,6 +1164,7 @@ void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) { is_meta_ = true; } operands_.emplace_back(std::move(tensor)); + operands_[idx].is_const = config.is_tensor_const(idx); } num_outputs_ = config.num_outputs_; } @@ -1176,6 +1196,9 @@ void TensorIteratorBase::mark_resize_outputs(const TensorIteratorConfig& config) } for (const auto i : c10::irange(num_outputs_)) { const auto& output = tensor(i); + if (!output.defined()) { + operands_[i].will_resize = true; + } if (output.defined() && !output.sizes().equals(shape_)) { if (config.resize_outputs_ && !operands_[i].is_read_write) { operands_[i].will_resize = true; @@ -1507,18 +1530,23 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { // XLA and lazy tensors don't have storage, so they don't have an underlying data pointer. // Nothing beyond this point is important for meta functions, so it's fine to exit early here. - // Extend the condition to ORT tesnors as ORT tensors also don't have storage. + // Extend the condition to MAIA tensors as MAIA tensors also don't have storage.
if (privateuse1_without_storage || common_device_.type() == DeviceType::MTIA || common_device_.type() == DeviceType::XLA || common_device_.type() == DeviceType::IPU || common_device_.type() == DeviceType::Lazy || - common_device_.type() == DeviceType::ORT || + common_device_.type() == DeviceType::MAIA || common_device_.type() == DeviceType::HPU) return; for (auto& op : operands_) { TORCH_INTERNAL_ASSERT(op.tensor_base().defined()); - op.data = op.tensor_base().data_ptr(); + if (op.is_const) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + op.data = const_cast(op.tensor_base().const_data_ptr()); + } else { + op.data = op.tensor_base().mutable_data_ptr(); + } } // zero out offsets @@ -1654,7 +1682,7 @@ SplitUntil32Bit::iterator& SplitUntil32Bit::iterator::operator++() { vec.pop_back(); while (!vec.empty() && !vec.back()->can_use_32bit_indexing()) { auto& iter = *vec.back(); - int64_t split_dim = iter.get_dim_to_split(); + auto split_dim = iter.get_dim_to_split(); vec.emplace_back(iter.split(split_dim)); } return *this; @@ -1683,7 +1711,7 @@ DimCounter::DimCounter(IntArrayRef shape, Range range) } int64_t linear_offset = range.begin; - int64_t ndim = values.size(); + auto ndim = values.size(); for (const auto dim : c10::irange(ndim)) { int64_t size = shape[dim]; if (size > 0) { @@ -1700,9 +1728,9 @@ bool DimCounter::is_done() const { void DimCounter::increment(const std::array& step) { offset += step[0] * step[1]; - int64_t ndim = values.size(); + auto ndim = values.size(); int64_t overflow = step[0]; - int i = 0; + size_t i = 0; if (step[1] != 1) { TORCH_INTERNAL_ASSERT(step[0] == shape[0] && values[0] == 0); i = 1; @@ -1719,7 +1747,7 @@ void DimCounter::increment(const std::array& step) { } else { overflow = 0; } - values[i] = value; + values[i] = static_cast(value); } TORCH_INTERNAL_ASSERT(overflow == 0 || overflow == 1); } diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index f34ffad3f3b43..a241244a5744c 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -79,7 +79,7 @@ constexpr int64_t GRAIN_SIZE = 32768; // Storage for a non-owning Tensor, without needing to include Tensor.h class TORCH_API OpaqueOptionalTensorRef { - alignas(alignof(TensorBase)) std::array data_; + alignas(alignof(TensorBase)) std::array data_{}; public: OpaqueOptionalTensorRef(); @@ -167,10 +167,23 @@ struct TORCH_API OperandInfo { bool is_output = false; + // will_resize applies only to output tensors. + // 1) Functional call (like torch.add(self, other)): the output tensor is + // undefined, and PyTorch creates a new tensor using the common shape and + // computed stride in TensorIterator; + // 2) Inplace call (like torch.add_(self, other)): the output tensor is the + // same as the input tensor, so its size and stride cannot be modified; + // 3) Op call with an output (like torch.add(self, other, out=output)): the + // output tensor is defined, but its shape may differ from the common shape. + // If the shapes differ, this output tensor will be resized using the common + // shape and computed stride in TensorIterator; otherwise its size and + // stride cannot be modified.
bool will_resize = false; bool is_read_write = false; + bool is_const = false; + void validate() { TORCH_CHECK( !tensor_base_->defined() || tensor_base_->layout() == kStrided, @@ -291,11 +304,11 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { bool is_dim_reduced(int dim) const; /// Accessors for each operand - IntArrayRef strides(int arg) const { + IntArrayRef strides(int64_t arg) const { return operands_[arg].stride_bytes; } - void* data_ptr(int arg) const; - ScalarType dtype(int arg = 0) const { + void* data_ptr(int64_t arg) const; + ScalarType dtype(int64_t arg = 0) const { return operands_[arg].current_dtype; } ScalarType common_dtype() const { @@ -304,43 +317,43 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { "Queried for invalid common dtype!"); return common_dtype_; } - ScalarType input_dtype(int arg = 0) const { + ScalarType input_dtype(int64_t arg = 0) const { return operands_[num_outputs_ + arg].current_dtype; } - Device device(int arg = 0) const { + Device device(int64_t arg = 0) const { return operands_[arg].device.value(); } - c10::DeviceType device_type(int arg = 0) const { + c10::DeviceType device_type(int64_t arg = 0) const { return device(arg).type(); } - int64_t element_size(int arg) const { + int64_t element_size(int64_t arg) const { return static_cast(elementSize(dtype(arg))); } - bool is_scalar(int arg) const; - bool is_cpu_scalar(int arg) const; + bool is_scalar(int64_t arg) const; + bool is_cpu_scalar(int64_t arg) const; - const TensorBase& tensor_base(int arg) const { + const TensorBase& tensor_base(int64_t arg) const { return operands_[arg].tensor_base(); } - const Tensor& tensor(int arg) const { + const Tensor& tensor(int64_t arg) const { return operands_[arg].tensor(); } - const TensorBase& output_base(int arg = 0) const { + const TensorBase& output_base(int64_t arg = 0) const { AT_ASSERT(arg < num_outputs_); return tensor_base(arg); } - const Tensor& output(int arg = 0) const { + const Tensor& output(int64_t arg = 0) const { AT_ASSERT(arg < num_outputs_); return tensor(arg); } - const TensorBase& input_base(int arg = 0) const { + const TensorBase& input_base(int64_t arg = 0) const { AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); return tensor_base(num_outputs_ + arg); } - const Tensor& input(int arg = 0) const { + const Tensor& input(int64_t arg = 0) const { AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); return tensor(num_outputs_ + arg); } @@ -350,7 +363,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { void cast_outputs(); /// Removes an operand from this iterator - void remove_operand(int arg); + void remove_operand(int64_t arg); /// Shrinks an iterated dimension void narrow(int dim, int64_t start, int64_t size); /// Narrows every dim after and including `start_dim` to size one. @@ -358,7 +371,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { /// Replaces the data pointer for the operand at index `arg`. /// The new pointer should have the same sizes, strides and dtype as the /// original - void unsafe_replace_operand(int arg, void* data); + void unsafe_replace_operand(int64_t arg, void* data); /// Splits this TensorIterator into two iterators. Together they iterate over /// the entire operation. Used by `with_32bit_indexing()`. 
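The TensorIterator hunks above thread const-awareness through the iterator: TensorIteratorConfig gains add_const_input / add_owned_const_input / add_borrowed_const_input, OperandInfo records is_const, and build() fetches const_data_ptr() for const operands. A hedged sketch of how a caller might use the new config methods (illustration only, not part of the patch; add_like is a hypothetical helper and the actual loop-kernel dispatch is elided):

#include <ATen/ATen.h>
#include <ATen/TensorIterator.h>

// Builds an element-wise iterator over two read-only inputs and one
// pre-allocated output; assumes `a` and `b` share dtype and device.
at::Tensor add_like(const at::Tensor& a, const at::Tensor& b) {
  at::Tensor out = at::empty_like(a);
  auto iter = at::TensorIteratorConfig()
                  .add_output(out)     // borrowed output, added before inputs
                  .add_const_input(a)  // read-only: build() uses const_data_ptr()
                  .add_const_input(b)
                  .build();
  // ... run a loop kernel (e.g. cpu_kernel) over `iter` here ...
  return out;
}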
@@ -368,7 +381,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { int get_dim_to_split() const; template - T scalar_value(int arg) { + T scalar_value(int64_t arg) { auto& op = operands_[arg]; return c10::fetch_and_cast(op.tensor_base().scalar_type(), op.data); } @@ -378,13 +391,14 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { /// If the scalar is aleady given in the type of Half, then return scalar /// value from tensor_base. template - T original_scalar_value(int arg) { + T original_scalar_value(int64_t arg) { auto& original_tensor_base = operands_[arg].original_tensor_base(); if (original_tensor_base.defined()) { TORCH_INTERNAL_ASSERT( original_tensor_base.scalar_type() != common_dtype()); return c10::fetch_and_cast( - original_tensor_base.scalar_type(), original_tensor_base.data_ptr()); + original_tensor_base.scalar_type(), + original_tensor_base.const_data_ptr()); } else { return scalar_value(arg); } @@ -413,10 +427,10 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { template < typename loop1d_t, std::enable_if_t< - std::is_convertible< + std::is_convertible_v< loop1d_t, c10::function_ref< - void(char**, const int64_t* strides, int64_t size)>>::value, + void(char**, const int64_t* strides, int64_t size)>>, int> = 0> void for_each(loop1d_t loop, int64_t grain_size = at::internal::GRAIN_SIZE) { for_each(loop_2d_from_1d(loop), grain_size); @@ -429,10 +443,10 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { template < typename loop1d_t, std::enable_if_t< - std::is_convertible< + std::is_convertible_v< loop1d_t, c10::function_ref< - void(char**, const int64_t* strides, int64_t size)>>::value, + void(char**, const int64_t* strides, int64_t size)>>, int> = 0> void serial_for_each(loop1d_t loop, Range range) { serial_for_each(loop_2d_from_1d(loop), range); @@ -443,7 +457,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { /// Create a strides array for a Tensor with shape of this iterator. The /// parameter `element_size` specifies the size of Tensor's data type in /// bytes (e.g. `4` for `float`) - StrideVector compatible_stride(int element_size) const; + StrideVector compatible_stride(int64_t element_size) const; /// Inverts the re-ordering done by reorder_dimensions. This can only be /// called *before* coalesce_dimensions() is called. @@ -462,13 +476,28 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { PtrVector get_base_ptrs() const; // Helper functions for advanced stride manipulations (e.g. torch.flip) - void _unsafe_set_arg_strides(const int arg, IntArrayRef strides) { + void _unsafe_set_arg_strides(const int64_t arg, IntArrayRef strides) { operands_[arg].stride_bytes = strides; } - void _unsafe_set_arg_data(const int arg, void* data) { + void _unsafe_set_arg_data(const int64_t arg, void* data) { operands_[arg].data = data; } + // Helper functions for custom device, custom device can get OperandInfo and + // NameVector in their side. + const OperandInfo& operand(int arg = 0) const { + return operands_[arg]; + } + OperandInfo& operand(int arg = 0) { + return operands_[arg]; + } + NameVector& get_dim_names() { + return names_; + } + const NameVector& get_dim_names() const { + return names_; + } + /// true if the stride computation can use 32-bit arithmetic. 
Used by GPU /// kernels bool can_use_32bit_indexing() const; @@ -769,10 +798,14 @@ class TORCH_API TensorIteratorConfig final { TensorIteratorConfig& add_input(const TensorBase& input) { return add_borrowed_input(input); } + TensorIteratorConfig& add_const_input(const TensorBase& input) { + return add_borrowed_const_input(input); + } // Borrowing from temporaries is unlikely to go well. TensorIteratorConfig& add_output(TensorBase&& output) = delete; TensorIteratorConfig& add_input(TensorBase&& input) = delete; + TensorIteratorConfig& add_const_input(TensorBase&& input) = delete; // Stores input/output Tensors while incrementing the reference count. // Note that add_{in,out}put are nearly always what you @@ -780,6 +813,7 @@ class TORCH_API TensorIteratorConfig final { // compile. TensorIteratorConfig& add_owned_output(const TensorBase& output); TensorIteratorConfig& add_owned_input(const TensorBase& input); + TensorIteratorConfig& add_owned_const_input(const TensorBase& input); // Advanced API: stores input/output Tensors without incrementing // the reference count. The caller must ensure that these Tensors @@ -788,10 +822,12 @@ class TORCH_API TensorIteratorConfig final { // Important: the outputs have to be added before the inputs. TensorIteratorConfig& add_borrowed_output(const TensorBase& output); TensorIteratorConfig& add_borrowed_input(const TensorBase& input); + TensorIteratorConfig& add_borrowed_const_input(const TensorBase& input); // Borrowing from temporaries is unlikely to go well. TensorIteratorConfig& add_borrowed_output(TensorBase&& output) = delete; TensorIteratorConfig& add_borrowed_input(TensorBase&& input) = delete; + TensorIteratorConfig& add_borrowed_const_input(TensorBase&& input) = delete; // Sets the check_mem_overlap_ flag, which is true by default. 
// If true, inputs are checked for partial overlap with the outputs and @@ -929,6 +965,8 @@ class TORCH_API TensorIteratorConfig final { } private: + bool is_tensor_const(size_t idx); + SmallVector, 4> tensors_; int num_outputs_ = 0; int num_inputs_ = 0; @@ -947,6 +985,8 @@ class TORCH_API TensorIteratorConfig final { bool promote_inputs_to_common_dtype_ = false; bool promote_integer_inputs_to_float_ = false; bool cast_common_dtype_to_outputs_ = false; + + SmallVector const_tensor_indices_; }; /// A container-like struct that acts as if it contains splits of a @@ -981,6 +1021,7 @@ struct TORCH_API SplitUntil32Bit { iterator end() const; private: + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const TensorIteratorBase& iter; }; diff --git a/aten/src/ATen/TensorIteratorInternal.h b/aten/src/ATen/TensorIteratorInternal.h index 1b4d4963b8638..ec0cb6c8fdfcb 100644 --- a/aten/src/ATen/TensorIteratorInternal.h +++ b/aten/src/ATen/TensorIteratorInternal.h @@ -25,8 +25,8 @@ inline void get_data_ptrs( ArrayRef base, IntArrayRef strides, IntArrayRef counter) { - const int64_t ntensors = base.size(); - const int64_t ndim = counter.size(); + const auto ntensors = base.size(); + const auto ndim = counter.size(); std::copy(base.begin(), base.end(), ptrs); for (const auto dim : c10::irange(ndim)) { int64_t value = counter[dim]; diff --git a/aten/src/ATen/TensorNames.cpp b/aten/src/ATen/TensorNames.cpp index fe7165816eb87..bff12aa8de65f 100644 --- a/aten/src/ATen/TensorNames.cpp +++ b/aten/src/ATen/TensorNames.cpp @@ -53,8 +53,9 @@ TensorNames::TensorNames(ArrayRef names) { } TensorNames::TensorNames(ArrayRef names, int64_t start, int64_t end) { - start = maybe_wrap_dim(start, names.size()); - end = maybe_wrap_dim(end, names.size()); + int64_t names_size = static_cast(names.size()); + start = maybe_wrap_dim(start, names_size); + end = maybe_wrap_dim(end, names_size); names_.reserve(end - start); for (const auto idx : c10::irange(start, end)) { names_.emplace_back(names, idx); @@ -83,7 +84,7 @@ TensorNames& TensorNames::unifyFromRightInplace(const TensorNames& other, const return *this; } -void TensorNames::append(TensorName&& name) { +void TensorNames::append(TensorName name) { names_.emplace_back(name); } diff --git a/aten/src/ATen/TensorNames.h b/aten/src/ATen/TensorNames.h index 4ec3d064867fb..616efc14d2599 100644 --- a/aten/src/ATen/TensorNames.h +++ b/aten/src/ATen/TensorNames.h @@ -63,11 +63,11 @@ struct TORCH_API TensorNames { const char* op_name = "unify"); void checkUnique(const char* op_name) const; - void append(TensorName&& name); + void append(TensorName name); std::vector toDimnameVec() const; private: - explicit TensorNames(TensorNameVec&& names) : names_(names){}; + explicit TensorNames(TensorNameVec&& names) : names_(std::move(names)){}; TensorNameVec names_; }; diff --git a/aten/src/ATen/TensorOperators.h b/aten/src/ATen/TensorOperators.h index feaad09438a80..7567af4cbfe46 100644 --- a/aten/src/ATen/TensorOperators.h +++ b/aten/src/ATen/TensorOperators.h @@ -9,9 +9,6 @@ #include #endif -#include -#include - namespace at { #define AT_FORALL_BINARY_OPS(_) \ diff --git a/aten/src/ATen/TensorSubclassLikeUtils.h b/aten/src/ATen/TensorSubclassLikeUtils.h index 44b4223245903..a9a0b4ecdcf8b 100644 --- a/aten/src/ATen/TensorSubclassLikeUtils.h +++ b/aten/src/ATen/TensorSubclassLikeUtils.h @@ -43,8 +43,7 @@ constexpr auto kTensorSubclassLike = // no matter the backend component DispatchKey::Batched, DispatchKey::Sparse, - DispatchKey::SparseCsrCPU, - 
DispatchKey::SparseCsrCUDA, + DispatchKey::SparseCsr, DispatchKey::Python}) | DispatchKeySet(BackendComponent::MetaBit); diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 6e1ce8166cb77..e425a0a8ed130 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -9,7 +9,7 @@ namespace at { -std::ostream& operator<<(std::ostream & out, TensorGeometryArg t) { +std::ostream& operator<<(std::ostream & out, const TensorGeometryArg& t) { if (t.pos == 0) { // 0 is distinguished; it usually indicates 'self' or the return // tensor @@ -68,7 +68,7 @@ void checkAllContiguous(CheckedFrom c, at::ArrayRef ts) { } void checkSize(CheckedFrom c, const TensorGeometryArg& t, IntArrayRef sizes) { - checkDim(c, t, sizes.size()); + checkDim(c, t, static_cast(sizes.size())); TORCH_CHECK( t->sizes().equals(sizes), "Expected tensor of size ", sizes, ", but got tensor of size ", t->sizes(), @@ -76,7 +76,7 @@ void checkSize(CheckedFrom c, const TensorGeometryArg& t, IntArrayRef sizes) { } void checkSize_symint(CheckedFrom c, const TensorGeometryArg& t, c10::SymIntArrayRef sizes) { - checkDim(c, t, sizes.size()); + checkDim(c, t, static_cast(sizes.size())); TORCH_CHECK( t->sym_sizes().equals(sizes), "Expected tensor of size ", sizes, ", but got tensor of size ", t->sizes(), @@ -91,7 +91,7 @@ void checkSize(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, int64_t s " (while checking arguments for ", c, ")"); } -void checkSize_symint(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, c10::SymInt size) { +void checkSize_symint(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, const c10::SymInt& size) { TORCH_CHECK( t->sym_size(dim) == size, "Expected tensor to have size ", size, " at dimension ", dim, @@ -343,12 +343,13 @@ inline c10::optional computeStride_impl( // This could perhaps be combined with the below code, but the complexity // didn't seem worth it. const Numel numel = c10::multiply_integers(oldshape); - if (numel == 0 && oldshape.equals(newshape)) { + bool zero_numel = TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(numel, 0)); + if (zero_numel && oldshape.equals(newshape)) { return toResult(oldstride); } ResultVec newstride(newshape.size()); - if (numel == 0) { + if (zero_numel) { for (int64_t view_d = newshape.size() - 1; view_d >= 0; view_d--) { if (view_d == (int64_t)(newshape.size() - 1)) { newstride[view_d] = 1; @@ -370,10 +371,10 @@ inline c10::optional computeStride_impl( tensor_numel *= oldshape[tensor_d]; // if end of tensor size chunk, check view if ((tensor_d == 0) || - (oldshape[tensor_d - 1] != 1 && + (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(oldshape[tensor_d - 1], 1)) && oldstride[tensor_d - 1] != tensor_numel * chunk_base_stride)) { while (view_d >= 0 && - (view_numel < tensor_numel || newshape[view_d] == 1)) { + (TORCH_GUARD_SIZE_OBLIVIOUS(sym_lt(view_numel, tensor_numel)) || TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(newshape[view_d], 1)))) { newstride[view_d] = view_numel * chunk_base_stride; view_numel *= newshape[view_d]; view_d--; diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index 4a95e622257e7..4615ab50606ee 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -20,12 +20,14 @@ namespace at { // which do NO argument checking by default. 
struct TORCH_API TensorArg { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const Tensor& tensor; const char* name; int pos; // 1-indexed TensorArg(const Tensor& tensor, const char* name, int pos) : tensor(tensor), name(name), pos(pos) {} // Try to mitigate any possibility of dangling reference to temporaries. + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) TensorArg(Tensor&& tensor, const char* name, int pos) = delete; const Tensor* operator->() const { return &tensor; @@ -66,7 +68,9 @@ using CheckedFrom = const char*; // not TensorGeometryArg, because the Tensor to TensorGeometry // conversion will blow up if you have undefined tensors. -TORCH_API std::ostream& operator<<(std::ostream& out, TensorGeometryArg t); +TORCH_API std::ostream& operator<<( + std::ostream& out, + const TensorGeometryArg& t); TORCH_API void checkDim( CheckedFrom c, const Tensor& tensor, @@ -103,7 +107,7 @@ TORCH_API void checkSize_symint( CheckedFrom c, const TensorGeometryArg& t, int64_t dim, - c10::SymInt size); + const c10::SymInt& size); TORCH_API void checkNumel( CheckedFrom c, const TensorGeometryArg& t, diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index 7cae9997ab05a..8419499c3a563 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -1,7 +1,5 @@ #pragma once -#include - #include #include #include @@ -98,6 +96,7 @@ class TORCH_API ThreadLocalStateGuard { } private: + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const ThreadLocalState prev_state_; }; diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index eb71fe315d430..cf33d89e0814e 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -190,8 +190,8 @@ std::string show_config() { ss << detail::getCUDAHooks().showConfig(); } - if (hasORT()) { - ss << detail::getORTHooks().showConfig(); + if (hasMAIA()) { + ss << detail::getMAIAHooks().showConfig(); } if (hasXPU()) { diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index 142665b7c8b27..8b1ad3026cd04 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -104,37 +104,40 @@ inline void maybe_wrap_dims( // dimension behavior and dimension size checking). We maintain this behavior // for backwards compatibility, but only for this specific size (i.e. other // empty sizes are not skipped). 
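To illustrate the behavior the functions below preserve: a 1-D tensor of size 0 (the legacy "empty" shape) is skipped, and the dimension is wrapped against the first remaining entry. A small, hypothetical example (shapes chosen for illustration only):

// Hypothetical example of the legacy cat dim-wrapping rule.
const int64_t legacy_empty[] = {0};       // 1-D, size 0: skipped
const int64_t shape[] = {2, 3, 4};
std::vector<c10::IntArrayRef> sizes = {legacy_empty, shape};
int64_t d = at::legacy_cat_wrap_dim(-1, sizes);  // wraps -1 against ndim 3 -> 2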
-template -inline int64_t _legacy_cat_wrap_dim( +inline int64_t legacy_cat_wrap_dim( int64_t dim, - const std::vector>& tensor_sizes) { + const std::vector>& tensor_sizes) { for (auto& sizes : tensor_sizes) { if (sizes.size() == 1 && sizes[0] == 0) { continue; } - return maybe_wrap_dim(dim, sizes.size()); + return maybe_wrap_dim(dim, static_cast(sizes.size())); } return dim; } -inline int64_t legacy_cat_wrap_dim( - int64_t dim, - const std::vector>& tensor_sizes) { - return _legacy_cat_wrap_dim(dim, tensor_sizes); -} - inline int64_t legacy_cat_wrap_dim_symint( int64_t dim, const std::vector>& tensor_sizes) { - return _legacy_cat_wrap_dim(dim, tensor_sizes); + for (auto& sizes : tensor_sizes) { + if (sizes.size() == 1) { + if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[0].sym_eq(0))) { + continue; + } + } + return maybe_wrap_dim(dim, static_cast(sizes.size())); + } + return dim; } inline int64_t legacy_cat_wrap_dim( int64_t dim, const MaterializedITensorListRef& tensors) { for (const Tensor& tensor : tensors) { - if (tensor.dim() == 1 && tensor.sizes()[0] == 0) { - continue; + if (tensor.dim() == 1) { + if (TORCH_GUARD_SIZE_OBLIVIOUS(tensor.sym_sizes()[0].sym_eq(0))) { + continue; + } } return maybe_wrap_dim(dim, tensor.dim()); } diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 7282bba9e6889..c233f17b44580 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -6,60 +6,14 @@ namespace at::autocast { -bool is_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastCUDA); +bool is_autocast_enabled(at::DeviceType device_type) { + at::DispatchKey dispatch_key = get_autocast_dispatch_key_from_device_type(device_type); + return !c10::impl::tls_is_dispatch_key_excluded(dispatch_key); } -void set_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastCUDA, !new_enabled); -} - -bool is_cpu_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastCPU); -} - -void set_cpu_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastCPU, !new_enabled); -} - -bool is_xpu_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastXPU); -} - -void set_xpu_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastXPU, !new_enabled); -} - -bool is_ipu_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastIPU); -} - -void set_ipu_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastIPU, !new_enabled); -} - -bool is_hpu_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastHPU); -} - -void set_hpu_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastHPU, !new_enabled); -} - -bool is_xla_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastXLA); -} - -void set_xla_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastXLA, !new_enabled); -} - -bool is_privateuseone_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastPrivateUse1); -} - -void set_privateuseone_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastPrivateUse1, !new_enabled); +void set_autocast_enabled(at::DeviceType device_type, bool enabled) { + at::DispatchKey dispatch_key = get_autocast_dispatch_key_from_device_type(device_type); + 
c10::impl::tls_set_dispatch_key_excluded(dispatch_key, !enabled); } namespace { @@ -91,30 +45,40 @@ std::mutex cached_casts_mutex; // it calls clear_cache() to ensure cached Tensors don't leak outside the autocasting region. thread_local int nesting = 0; -// autocast_cpu_dtype is the lower_precision_fp used by AutocastCPU. -thread_local at::ScalarType autocast_cpu_dtype = at::kBFloat16; - -// autocast_xpu_dtype is the lower_precision_fp used by AutocastXPU. -thread_local at::ScalarType autocast_xpu_dtype = at::kBFloat16; - -// autocast_ipu_dtype is the lower_precision_fp used by AutocastIPU. -thread_local at::ScalarType autocast_ipu_dtype = at::kHalf; - -// autocast_hpu_dtype is the lower_precision_fp used by AutocastHPU. -thread_local at::ScalarType autocast_hpu_dtype = at::kBFloat16; - -// autocast_xla_dtype is the lower_precision_fp used by AutocastXLA. -thread_local at::ScalarType autocast_xla_dtype = at::kBFloat16; +// The order of this array MUST exactly match the definition order of DeviceType +// in c10/core/DeviceType.h. +static_assert( + at::COMPILE_TIME_MAX_DEVICE_TYPES == 21, + "The definition of the default autocast data type per device backend doesn't match with the definition of the device type."); +thread_local std::array + autocast_dtype = { + at::kBFloat16, // CPU + at::kHalf, // CUDA. + at::ScalarType::Undefined, // Reserved for explicit MKLDNN + at::ScalarType::Undefined, // OpenGL + at::ScalarType::Undefined, // OpenCL + at::ScalarType::Undefined, // IDEEP. + at::kHalf, // AMD HIP + at::ScalarType::Undefined, // FPGA + at::ScalarType::Undefined, // ONNX Runtime / Microsoft + at::kBFloat16, // XLA / TPU + at::ScalarType::Undefined, // Vulkan + at::ScalarType::Undefined, // Metal + at::kBFloat16, // XPU + at::ScalarType::Undefined, // MPS + at::ScalarType::Undefined, // Meta (tensors with no data) + at::kBFloat16, // HPU / HABANA + at::ScalarType::Undefined, // SX-Aurora / NEC + at::ScalarType::Undefined, // Lazy Tensors + at::kHalf, // Graphcore IPU + at::ScalarType::Undefined, // Meta training and inference devices + at::kHalf, // PrivateUse1 device +}; // should we enabled the cache inside autocast. thread_local bool cache_enabled = true; -// autocast_gpu_dtype is the lower_precision_fp used by AutocastGPU. -thread_local at::ScalarType autocast_gpu_dtype = at::kHalf; - -// autocast_privateuseone_dtype is the lower_precision_fp used by AutocastPrivateUse1. 
-thread_local at::ScalarType autocast_privateuseone_dtype = at::kHalf; -} +} // anonymous namespace void clear_cache() { const std::lock_guard lock(cached_casts_mutex); @@ -129,60 +93,12 @@ int decrement_nesting() { return --nesting; } -at::ScalarType get_autocast_gpu_dtype() { - return autocast_gpu_dtype; +at::ScalarType get_autocast_dtype(at::DeviceType device_type) { + return autocast_dtype[static_cast(device_type)]; } -at::ScalarType get_autocast_cpu_dtype() { - return autocast_cpu_dtype; -} - -at::ScalarType get_autocast_xpu_dtype() { - return autocast_xpu_dtype; -} - -at::ScalarType get_autocast_ipu_dtype() { - return autocast_ipu_dtype; -} - -at::ScalarType get_autocast_hpu_dtype() { - return autocast_hpu_dtype; -} - -at::ScalarType get_autocast_xla_dtype() { - return autocast_xla_dtype; -} - -at::ScalarType get_autocast_privateuseone_dtype() { - return autocast_privateuseone_dtype; -} - -void set_autocast_cpu_dtype(at::ScalarType dtype) { - autocast_cpu_dtype = dtype; -} - -void set_autocast_gpu_dtype(at::ScalarType dtype) { - autocast_gpu_dtype = dtype; -} - -void set_autocast_xpu_dtype(at::ScalarType dtype) { - autocast_xpu_dtype = dtype; -} - -void set_autocast_ipu_dtype(at::ScalarType dtype) { - autocast_ipu_dtype = dtype; -} - -void set_autocast_hpu_dtype(at::ScalarType dtype) { - autocast_hpu_dtype = dtype; -} - -void set_autocast_xla_dtype(at::ScalarType dtype) { - autocast_xla_dtype = dtype; -} - -void set_autocast_privateuseone_dtype(at::ScalarType dtype) { - autocast_privateuseone_dtype = dtype; +void set_autocast_dtype(at::DeviceType device_type, at::ScalarType dtype) { + autocast_dtype[static_cast(device_type)] = dtype; } bool is_autocast_cache_enabled() { @@ -241,135 +157,46 @@ namespace { /***************************************** Explicit registration for out-of-place ops *****************************************/ + TORCH_LIBRARY_IMPL(_, Autocast, m) { m.fallback(torch::CppFunction::makeFallthrough()); } TORCH_LIBRARY_IMPL(aten, Autocast, m) { // lower_precision_fp - KERNEL_CUDA2(_convolution, deprecated, lower_precision_fp) - KERNEL_CUDA(_convolution, lower_precision_fp) - KERNEL_CUDA(conv1d, lower_precision_fp) - KERNEL_CUDA(conv2d, lower_precision_fp) - KERNEL_CUDA(conv3d, lower_precision_fp) - KERNEL_CUDA(conv_tbc, lower_precision_fp) - KERNEL_CUDA(conv_transpose1d, lower_precision_fp) - KERNEL_CUDA2(conv_transpose2d, input, lower_precision_fp) - KERNEL_CUDA2(conv_transpose3d, input, lower_precision_fp) - KERNEL_CUDA(convolution, lower_precision_fp) +#define _KERNEL_CUDA_LOW_PRECISION_FP(...) 
\ + KERNEL_CUDA(__VA_ARGS__, lower_precision_fp) + + AT_FORALL_LOWER_PRECISION_FP(_KERNEL_CUDA_LOW_PRECISION_FP) KERNEL_CUDA(cudnn_convolution, lower_precision_fp) KERNEL_CUDA(cudnn_convolution_transpose, lower_precision_fp) - KERNEL_CUDA(prelu, lower_precision_fp) - KERNEL_CUDA(addmm, lower_precision_fp) - KERNEL_CUDA(addmv, lower_precision_fp) - KERNEL_CUDA(addr, lower_precision_fp) - KERNEL_CUDA(matmul, lower_precision_fp) - KERNEL_CUDA(einsum, lower_precision_fp) - KERNEL_CUDA(mm, lower_precision_fp) - KERNEL_CUDA(mv, lower_precision_fp) - KERNEL_CUDA(linalg_vecdot, lower_precision_fp) - KERNEL_CUDA(linear, lower_precision_fp) - KERNEL_CUDA(addbmm, lower_precision_fp) - KERNEL_CUDA(baddbmm, lower_precision_fp) - KERNEL_CUDA(bmm, lower_precision_fp) - KERNEL_CUDA(chain_matmul, lower_precision_fp) - KERNEL_CUDA(linalg_multi_dot, lower_precision_fp) - KERNEL_CUDA(_thnn_fused_lstm_cell, lower_precision_fp) - KERNEL_CUDA(_thnn_fused_gru_cell, lower_precision_fp) - KERNEL_CUDA(lstm_cell, lower_precision_fp) - KERNEL_CUDA(gru_cell, lower_precision_fp) - KERNEL_CUDA(rnn_tanh_cell, lower_precision_fp) - KERNEL_CUDA(rnn_relu_cell, lower_precision_fp) - KERNEL_CUDA(_scaled_dot_product_flash_attention, lower_precision_fp) - KERNEL_CUDA(scaled_dot_product_attention, lower_precision_fp) // fp32 - KERNEL_CUDA(acos, fp32) - KERNEL_CUDA(asin, fp32) - KERNEL_CUDA(cosh, fp32) - KERNEL_CUDA(erfinv, fp32) - KERNEL_CUDA(exp, fp32) - KERNEL_CUDA(expm1, fp32) - KERNEL_CUDA(log, fp32) - KERNEL_CUDA(log10, fp32) - KERNEL_CUDA(log2, fp32) - KERNEL_CUDA(log1p, fp32) - KERNEL_CUDA(reciprocal, fp32) - KERNEL_CUDA(rsqrt, fp32) - KERNEL_CUDA(sinh, fp32) - KERNEL_CUDA(tan, fp32) - KERNEL_CUDA2(pow, Tensor_Scalar, fp32) - KERNEL_CUDA2(pow, Tensor_Tensor, fp32) - KERNEL_CUDA2(pow, Scalar, fp32) - KERNEL_CUDA(softplus, fp32) - KERNEL_CUDA(layer_norm, fp32) - KERNEL_CUDA(native_layer_norm, fp32) - KERNEL_CUDA(group_norm, fp32) - KERNEL_CUDA2(frobenius_norm, dim, fp32) - KERNEL_CUDA(nuclear_norm, fp32) - KERNEL_CUDA2(nuclear_norm, dim, fp32) - KERNEL_CUDA(cosine_similarity, fp32) - KERNEL_CUDA(poisson_nll_loss, fp32) - KERNEL_CUDA(cosine_embedding_loss, fp32) - KERNEL_CUDA(nll_loss, fp32) - KERNEL_CUDA(nll_loss2d, fp32) - KERNEL_CUDA(hinge_embedding_loss, fp32) - KERNEL_CUDA(kl_div, fp32) - KERNEL_CUDA(l1_loss, fp32) - KERNEL_CUDA(smooth_l1_loss, fp32) - KERNEL_CUDA(huber_loss, fp32) - KERNEL_CUDA(mse_loss, fp32) - KERNEL_CUDA(margin_ranking_loss, fp32) - KERNEL_CUDA(multilabel_margin_loss, fp32) - KERNEL_CUDA(soft_margin_loss, fp32) - KERNEL_CUDA(triplet_margin_loss, fp32) - KERNEL_CUDA(multi_margin_loss, fp32) - KERNEL_CUDA(binary_cross_entropy_with_logits, fp32) - KERNEL_CUDA(dist, fp32) - KERNEL_CUDA(pdist, fp32) - KERNEL_CUDA(cdist, fp32) - KERNEL_CUDA(renorm, fp32) - KERNEL_CUDA(logsumexp, fp32) +#define _KERNEL_CUDA_FP32(...) 
KERNEL_CUDA(__VA_ARGS__, fp32) + + AT_FORALL_FP32(_KERNEL_CUDA_FP32) + // fp32_set_opt_dtype - KERNEL_CUDA(prod, fp32_set_opt_dtype) - KERNEL_CUDA2(prod, dim_int, fp32_set_opt_dtype) - KERNEL_CUDA2(prod, dim_Dimname, fp32_set_opt_dtype) - KERNEL_CUDA2(softmax, int, fp32_set_opt_dtype) - KERNEL_CUDA2(softmax, Dimname, fp32_set_opt_dtype) - KERNEL_CUDA2(log_softmax, int, fp32_set_opt_dtype) - KERNEL_CUDA2(log_softmax, Dimname, fp32_set_opt_dtype) - KERNEL_CUDA(cumprod, fp32_set_opt_dtype) - KERNEL_CUDA2(cumprod, dimname, fp32_set_opt_dtype) - KERNEL_CUDA(cumsum, fp32_set_opt_dtype) - KERNEL_CUDA2(cumsum, dimname, fp32_set_opt_dtype) - KERNEL_CUDA(linalg_vector_norm, fp32_set_opt_dtype) - KERNEL_CUDA(linalg_matrix_norm, fp32_set_opt_dtype) - KERNEL_CUDA2(linalg_matrix_norm, str_ord, fp32_set_opt_dtype) +#define _KERNEL_CUDA_FP32_SET_OPT_DTYPE(...) \ + KERNEL_CUDA(__VA_ARGS__, fp32_set_opt_dtype) + + AT_FORALL_FP32_SET_OPT_DTYPE(_KERNEL_CUDA_FP32_SET_OPT_DTYPE) // commenting these out because they accept an explicit (not-optional) dtype, and we shouldn't try to flip that even // when autocasting. - // KERNEL_CUDA2(norm, ScalarOpt_dtype, fp32_set_opt_dtype) - // KERNEL_CUDA2(norm, ScalarOpt_dim_dtype, fp32_set_opt_dtype) - // KERNEL_CUDA2(norm, names_ScalarOpt_dim_dtype, fp32_set_opt_dtype) - KERNEL_CUDA(sum, fp32_set_opt_dtype) - KERNEL_CUDA2(sum, dim_IntList, fp32_set_opt_dtype) - KERNEL_CUDA2(sum, dim_DimnameList, fp32_set_opt_dtype) + // KERNEL_CUDA(norm, ScalarOpt_dtype, fp32_set_opt_dtype) + // KERNEL_CUDA(norm, ScalarOpt_dim_dtype, fp32_set_opt_dtype) + // KERNEL_CUDA(norm, names_ScalarOpt_dim_dtype, fp32_set_opt_dtype) + // fp32_append_dtype // The fp32_append_dtype wrapper overrides implicit promotion behavior. // norm does not implicitly promote, but be aware when adding new ops to this policy. - KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA(ADD_NS(norm), "norm.Scalar", Tensor (const Tensor &, const Scalar&), Tensor (const Tensor &, const c10::optional&, ScalarType), fp32_append_dtype) - KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA(ADD_NS(norm), "norm.ScalarOpt_dim", Tensor (const Tensor &, const c10::optional&, IntArrayRef, bool), Tensor (const Tensor &, const c10::optional&, IntArrayRef, bool, ScalarType), fp32_append_dtype) - KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA(ADD_NS(norm), "norm.names_ScalarOpt_dim", Tensor (const Tensor &, const c10::optional&, DimnameList, bool), Tensor (const Tensor &, const c10::optional&, DimnameList, bool, ScalarType), fp32_append_dtype) + AT_FORALL_DIFFERENT_REDISPATCH_SIGNATURE( + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA) + // promote - KERNEL_CUDA(addcdiv, promote) - KERNEL_CUDA(addcmul, promote) - KERNEL_CUDA(atan2, promote) - KERNEL_CUDA(bilinear, promote) - KERNEL_CUDA(cross, promote) - KERNEL_CUDA(dot, promote) - KERNEL_CUDA(grid_sampler, promote) - KERNEL_CUDA(index_put, promote) - KERNEL_CUDA(tensordot, promote) - KERNEL_CUDA(scatter_add, promote) +#define _KERNEL_CUDA_PROMOTE(...) 
KERNEL_CUDA(__VA_ARGS__, promote) + + AT_FORALL_PROMOTE(_KERNEL_CUDA_PROMOTE) m.impl(TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"), TORCH_FN((&at::autocast::binary_cross_entropy_banned))); @@ -383,11 +210,11 @@ TORCH_LIBRARY_IMPL(_, AutocastCPU, m) { TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { // lower_precision_fp cast policy KERNEL_CPU(conv1d, lower_precision_fp) - KERNEL_CPU2(conv1d, padding, lower_precision_fp) + KERNEL_CPU(conv1d, padding, lower_precision_fp) KERNEL_CPU(conv2d, lower_precision_fp) - KERNEL_CPU2(conv2d, padding, lower_precision_fp) + KERNEL_CPU(conv2d, padding, lower_precision_fp) KERNEL_CPU(conv3d, lower_precision_fp) - KERNEL_CPU2(conv3d, padding, lower_precision_fp) + KERNEL_CPU(conv3d, padding, lower_precision_fp) KERNEL_CPU(bmm, lower_precision_fp) KERNEL_CPU(mm, lower_precision_fp) KERNEL_CPU(linalg_vecdot, lower_precision_fp) @@ -395,13 +222,13 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(addmm, lower_precision_fp) KERNEL_CPU(addbmm, lower_precision_fp) KERNEL_CPU(linear, lower_precision_fp) - KERNEL_CPU2(_convolution, deprecated, lower_precision_fp) + KERNEL_CPU(_convolution, deprecated, lower_precision_fp) KERNEL_CPU(matmul, lower_precision_fp) KERNEL_CPU(conv_tbc, lower_precision_fp) KERNEL_CPU(mkldnn_rnn_layer, lower_precision_fp) KERNEL_CPU(conv_transpose1d, lower_precision_fp) - KERNEL_CPU2(conv_transpose2d, input, lower_precision_fp) - KERNEL_CPU2(conv_transpose3d, input, lower_precision_fp) + KERNEL_CPU(conv_transpose2d, input, lower_precision_fp) + KERNEL_CPU(conv_transpose3d, input, lower_precision_fp) KERNEL_CPU(prelu, lower_precision_fp) KERNEL_CPU(scaled_dot_product_attention, lower_precision_fp) KERNEL_CPU(_native_multi_head_attention, lower_precision_fp) @@ -412,14 +239,14 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(grid_sampler, fp32) KERNEL_CPU(polar, fp32) KERNEL_CPU(prod, fp32) - KERNEL_CPU2(prod, dim_int, fp32) - KERNEL_CPU2(prod, dim_Dimname, fp32) + KERNEL_CPU(prod, dim_int, fp32) + KERNEL_CPU(prod, dim_Dimname, fp32) KERNEL_CPU(quantile, fp32) - KERNEL_CPU2(quantile, scalar, fp32) + KERNEL_CPU(quantile, scalar, fp32) KERNEL_CPU(nanquantile, fp32) - KERNEL_CPU2(nanquantile, scalar, fp32) + KERNEL_CPU(nanquantile, scalar, fp32) KERNEL_CPU(stft, fp32) - KERNEL_CPU2(stft, center, fp32) + KERNEL_CPU(stft, center, fp32) KERNEL_CPU(cdist, fp32) KERNEL_CPU(grid_sampler_2d, fp32) KERNEL_CPU(_grid_sampler_2d_cpu_fallback, fp32) @@ -457,8 +284,8 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(soft_margin_loss, fp32) KERNEL_CPU(triplet_margin_loss, fp32) KERNEL_CPU(multi_margin_loss, fp32) - KERNEL_CPU2(ctc_loss, IntList, fp32) - KERNEL_CPU2(ctc_loss, Tensor, fp32) + KERNEL_CPU(ctc_loss, IntList, fp32) + KERNEL_CPU(ctc_loss, Tensor, fp32) KERNEL_CPU(kl_div, fp32) KERNEL_CPU(multilabel_margin_loss, fp32) KERNEL_CPU(binary_cross_entropy_with_logits, fp32) @@ -477,11 +304,11 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(fft_hfft, fp32) KERNEL_CPU(fft_ihfft, fp32) KERNEL_CPU(linalg_cond, fp32) - KERNEL_CPU2(linalg_cond, p_str, fp32) + KERNEL_CPU(linalg_cond, p_str, fp32) KERNEL_CPU(linalg_matrix_rank, fp32) - KERNEL_CPU2(linalg_matrix_rank, tol_tensor, fp32) - KERNEL_CPU2(linalg_matrix_rank, atol_rtol_tensor, fp32) - KERNEL_CPU2(linalg_matrix_rank, atol_rtol_float, fp32) + KERNEL_CPU(linalg_matrix_rank, tol_tensor, fp32) + KERNEL_CPU(linalg_matrix_rank, atol_rtol_tensor, fp32) + KERNEL_CPU(linalg_matrix_rank, atol_rtol_float, fp32) KERNEL_CPU(linalg_solve, fp32) KERNEL_CPU(linalg_cholesky, fp32) 
KERNEL_CPU(linalg_svdvals, fp32) @@ -513,8 +340,45 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(stack, promote) KERNEL_CPU(cat, promote) KERNEL_CPU(index_copy, promote) - KERNEL_CPU2(index_copy, dimname, promote) + KERNEL_CPU(index_copy, dimname, promote) + +} +TORCH_LIBRARY_IMPL(_, AutocastXPU, m) { + m.fallback(torch::CppFunction::makeFallthrough()); +} + +TORCH_LIBRARY_IMPL(aten, AutocastXPU, m) { + // lower_precision_fp +#define _KERNEL_XPU_LOW_PRECISION_FP(...) \ + KERNEL_XPU(__VA_ARGS__, lower_precision_fp) + + AT_FORALL_LOWER_PRECISION_FP(_KERNEL_XPU_LOW_PRECISION_FP) + + // fp32 +#define _KERNEL_XPU_FP32(...) KERNEL_XPU(__VA_ARGS__, fp32) + + AT_FORALL_FP32(_KERNEL_XPU_FP32) + + // fp32_set_opt_dtype +#define _KERNEL_XPU_FP32_SET_OPT_DTYPE(...) \ + KERNEL_XPU(__VA_ARGS__, fp32_set_opt_dtype) + + AT_FORALL_FP32_SET_OPT_DTYPE(_KERNEL_XPU_FP32_SET_OPT_DTYPE) + + // fp32_append_dtype + // The fp32_append_dtype wrapper overrides implicit promotion behavior. + // norm does not implicitly promote, but be aware when adding new ops to this policy. + AT_FORALL_DIFFERENT_REDISPATCH_SIGNATURE( + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_XPU) + + // promote +#define _KERNEL_XPU_PROMOTE(...) KERNEL_XPU(__VA_ARGS__, promote) + + AT_FORALL_PROMOTE(_KERNEL_XPU_PROMOTE) + + m.impl(TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"), + TORCH_FN((&at::autocast::binary_cross_entropy_banned))); } } // namespace diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h index b3f2fcd511ff6..59a91848a5175 100644 --- a/aten/src/ATen/autocast_mode.h +++ b/aten/src/ATen/autocast_mode.h @@ -10,40 +10,120 @@ namespace at::autocast { -TORCH_API bool is_enabled(); -TORCH_API void set_enabled(bool enabled); +TORCH_API bool is_autocast_enabled(at::DeviceType device_type); +TORCH_API void set_autocast_enabled(at::DeviceType device_type, bool enabled); +TORCH_API at::ScalarType get_autocast_dtype(at::DeviceType device_type); +TORCH_API void set_autocast_dtype( + at::DeviceType device_type, + at::ScalarType dtype); TORCH_API void clear_cache(); TORCH_API int increment_nesting(); TORCH_API int decrement_nesting(); -TORCH_API bool is_cpu_enabled(); -TORCH_API void set_cpu_enabled(bool enabled); -TORCH_API at::ScalarType get_autocast_gpu_dtype(); -TORCH_API at::ScalarType get_autocast_cpu_dtype(); -TORCH_API void set_autocast_gpu_dtype(at::ScalarType dtype); -TORCH_API void set_autocast_cpu_dtype(at::ScalarType dtype); -TORCH_API bool is_xpu_enabled(); -TORCH_API void set_xpu_enabled(bool enabled); -TORCH_API at::ScalarType get_autocast_xpu_dtype(); -TORCH_API void set_autocast_xpu_dtype(at::ScalarType dtype); -TORCH_API bool is_ipu_enabled(); -TORCH_API void set_ipu_enabled(bool enabled); -TORCH_API at::ScalarType get_autocast_ipu_dtype(); -TORCH_API void set_autocast_ipu_dtype(at::ScalarType dtype); -TORCH_API bool is_hpu_enabled(); -TORCH_API void set_hpu_enabled(bool enabled); -TORCH_API at::ScalarType get_autocast_hpu_dtype(); -TORCH_API void set_autocast_hpu_dtype(at::ScalarType dtype); -TORCH_API bool is_xla_enabled(); -TORCH_API void set_xla_enabled(bool enabled); -TORCH_API at::ScalarType get_autocast_xla_dtype(); -TORCH_API void set_autocast_xla_dtype(at::ScalarType dtype); -TORCH_API bool is_privateuseone_enabled(); -TORCH_API void set_privateuseone_enabled(bool enabled); -TORCH_API at::ScalarType get_autocast_privateuseone_dtype(); -TORCH_API void set_autocast_privateuseone_dtype(at::ScalarType dtype); TORCH_API bool is_autocast_cache_enabled(); TORCH_API void 
set_autocast_cache_enabled(bool enabled); +// deprecated CUDA-specific autocast APIs +C10_DEPRECATED_MESSAGE( + "at::autocast::is_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(at::kCUDA) instead.") +TORCH_API inline bool is_enabled() { + TORCH_WARN_DEPRECATION( + "at::autocast::", + __func__, + "() is deprecated. Please use at::autocast::is_autocast_enabled(at::kCUDA) instead.") + return is_autocast_enabled(at::kCUDA); +} +C10_DEPRECATED_MESSAGE( + "at::autocast::set_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(at::kCUDA, enabled) instead.") +TORCH_API inline void set_enabled(bool enabled) { + TORCH_WARN_DEPRECATION( + "at::autocast::", + __func__, + "(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(at::kCUDA, enabled) instead.") + set_autocast_enabled(at::kCUDA, enabled); +} +C10_DEPRECATED_MESSAGE( + "at::autocast::get_autocast_gpu_dtype() is deprecated. Please use at::autocast::get_autocast_dtype(at::kCUDA) instead.") +TORCH_API inline at::ScalarType get_autocast_gpu_dtype() { + TORCH_WARN_DEPRECATION( + "at::autocast::", + __func__, + "() is deprecated. Please use at::autocast::get_autocast_dtype(at::kCUDA) instead.") + return get_autocast_dtype(at::kCUDA); +} +C10_DEPRECATED_MESSAGE( + "at::autocast::set_autocast_gpu_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(at::kCUDA, dtype) instead.") +TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) { + TORCH_WARN_DEPRECATION( + "at::autocast::", + __func__, + "(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(at::kCUDA, dtype) instead.") + set_autocast_dtype(at::kCUDA, dtype); +} + +#define DECLARE_DEPRECATED_AUTOCAST_APIS(name, device_type) \ + C10_DEPRECATED_MESSAGE( \ + "at::autocast::is_" #name \ + "_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(" #device_type \ + ") instead.") \ + TORCH_API inline bool is_##name##_enabled() { \ + TORCH_WARN_DEPRECATION( \ + "at::autocast::", \ + __func__, \ + "() is deprecated. Please use at::autocast::is_autocast_enabled(" #device_type \ + ") instead.") \ + return is_autocast_enabled(device_type); \ + } \ + \ + C10_DEPRECATED_MESSAGE( \ + "at::autocast::set_" #name \ + "_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(" #device_type \ + ", enabled) instead.") \ + TORCH_API inline void set_##name##_enabled(bool enabled) { \ + TORCH_WARN_DEPRECATION( \ + "at::autocast::", \ + __func__, \ + "(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(" #device_type \ + ", enabled) instead.") \ + set_autocast_enabled(device_type, enabled); \ + } \ + \ + C10_DEPRECATED_MESSAGE( \ + "at::autocast::get_autocast_" #name \ + "_dtype() is deprecated. Please use at::autocast::get_autocast_dtype(" #device_type \ + ") instead.") \ + TORCH_API inline at::ScalarType get_autocast_##name##_dtype() { \ + TORCH_WARN_DEPRECATION( \ + "at::autocast::", \ + __func__, \ + "() is deprecated. Please at::autocast::get_autocast_dtype(" #device_type \ + ") instead.") \ + return get_autocast_dtype(device_type); \ + } \ + \ + C10_DEPRECATED_MESSAGE( \ + "at::autocast::set_autocast_" #name \ + "_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(" #device_type \ + ", dtype) instead.") \ + TORCH_API inline void set_autocast_##name##_dtype(at::ScalarType dtype) { \ + TORCH_WARN_DEPRECATION( \ + "at::autocast::", \ + __func__, \ + "(dtype) is deprecated. 
Please use at::autocast::set_autocast_dtype(" #device_type \ + ", dtype) instead.") \ + set_autocast_dtype(device_type, dtype); \ + } + +#define AT_FORALL_DEPRECATED_AUTOCAST_BAKCNEDS(_) \ + _(cpu, at::kCPU) \ + _(xpu, at::kXPU) \ + _(xla, at::kXLA) \ + _(hpu, at::kHPU) \ + _(ipu, at::kIPU) \ + _(privateuseone, at::kPrivateUse1) + +// deprecated other backend specific autocast APIs +AT_FORALL_DEPRECATED_AUTOCAST_BAKCNEDS(DECLARE_DEPRECATED_AUTOCAST_APIS) + namespace { inline bool is_autocast_eligible( const Tensor& tensor, @@ -94,26 +174,24 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type( } } +inline bool is_autocast_available(c10::DeviceType device_type) { + if (device_type == at::kCPU || device_type == at::kCUDA || + device_type == at::kXPU || device_type == at::kIPU || + device_type == at::kHPU || device_type == at::kXLA || + device_type == at::kPrivateUse1) { + return true; + } else { + return false; + } +} + inline at::ScalarType get_lower_precision_fp_from_device_type( c10::DeviceType device_type) { - switch (device_type) { - case c10::DeviceType::CUDA: - return get_autocast_gpu_dtype(); - case c10::DeviceType::CPU: - return get_autocast_cpu_dtype(); - case c10::DeviceType::XPU: - return get_autocast_xpu_dtype(); - case c10::DeviceType::IPU: - return get_autocast_ipu_dtype(); - case c10::DeviceType::HPU: - return get_autocast_hpu_dtype(); - case c10::DeviceType::XLA: - return get_autocast_xla_dtype(); - case c10::DeviceType::PrivateUse1: - return get_autocast_privateuseone_dtype(); - default: - throw std::runtime_error( - "unknown device type for autocast in get_lower_precision_fp_from_device_type"); + if (is_autocast_available(device_type)) { + return get_autocast_dtype(device_type); + } else { + throw std::runtime_error( + "unknown device type for autocast in get_lower_precision_fp_from_device_type"); } } @@ -541,9 +619,13 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. #define ADD_NS(RAW_OP) at::RAW_OP +#define _KERNEL_OVERLOAD_NARG_IMPL(_0, _1, _2, N, ...) N +#define _KERNEL_OVERLOAD_NARG(...) \ + C10_EXPAND_MSVC_WORKAROUND(_KERNEL_OVERLOAD_NARG_IMPL(__VA_ARGS__, 2, 1)) + // Common cases where registration signature matches redispatch signature // (that's why SIGNATURE is repeated in the WrapFunction instantiation) -#define KERNEL(DISPATCHKEY, OP, POLICY) \ +#define KERNEL1(DISPATCHKEY, OP, POLICY) \ m.impl( \ TORCH_SELECTIVE_NAME("aten::" #OP), \ &::at::autocast::WrapFunction< \ @@ -563,6 +645,15 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. decltype(ATEN_FN2(OP, OVERLOAD)), \ &ATEN_FN2(OP, OVERLOAD)>::type::call); +#define _KERNEL_DISPATCH(DISPATCHKEY, NARG, ...) \ + C10_CONCATENATE(KERNEL, NARG)(DISPATCHKEY, __VA_ARGS__) + +#define _KERNEL_IMPL(DISPATCHKEY, ...) \ + _KERNEL_DISPATCH(DISPATCHKEY, _KERNEL_OVERLOAD_NARG(__VA_ARGS__), __VA_ARGS__) + +// It will dispatch to KERNEL1 or KERNEL2 based on its inputs. +#define KERNEL(DISPATCHKEY, ...) _KERNEL_IMPL(DISPATCHKEY, __VA_ARGS__) + // Less-common but still useful case: redispatching to a function // with a new signature (e.g. appending a dtype) #define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE( \ @@ -581,12 +672,9 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. 
REDISPATCH_SIGNATURE, \ &REDISPATCH_FUNC>::type::call); -// KERNEL_CPU/KERNEL_CPU2/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CPU -// registration for AutocastCPU -#define KERNEL_CPU(OP, POLICY) KERNEL(c10::DeviceType::CPU, OP, POLICY) - -#define KERNEL_CPU2(OP, OVERLOAD, POLICY) \ - KERNEL2(c10::DeviceType::CPU, OP, OVERLOAD, POLICY) +// KERNEL_CPU/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CPU +// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastCPU +#define KERNEL_CPU(...) KERNEL(c10::DeviceType::CPU, __VA_ARGS__) #define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CPU( \ REDISPATCH_FUNC, \ @@ -602,12 +690,9 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. REDISPATCH_SIGNATURE, \ POLICY) -// KERNEL_CUDA/KERNEL_CUDA2/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA -// registration for AutocastCUDA -#define KERNEL_CUDA(OP, POLICY) KERNEL(c10::DeviceType::CUDA, OP, POLICY) - -#define KERNEL_CUDA2(OP, OVERLOAD, POLICY) \ - KERNEL2(c10::DeviceType::CUDA, OP, OVERLOAD, POLICY) +// KERNEL_CUDA/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA +// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastCUDA +#define KERNEL_CUDA(...) KERNEL(c10::DeviceType::CUDA, __VA_ARGS__) #define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA( \ REDISPATCH_FUNC, \ @@ -623,14 +708,28 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. REDISPATCH_SIGNATURE, \ POLICY) -// KERNEL_PRIVATEUSEONE/KERNEL_PRIVATEUSEONE2/ -// KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE -// registration for AutocastPrivateUse1 -#define KERNEL_PRIVATEUSEONE(OP, POLICY) \ - KERNEL(c10::DeviceType::PrivateUse1, OP, POLICY) +// KERNEL_XPU/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_XPU +// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastXPU +#define KERNEL_XPU(...) KERNEL(c10::DeviceType::XPU, __VA_ARGS__) -#define KERNEL_PRIVATEUSEONE2(OP, OVERLOAD, POLICY) \ - KERNEL2(c10::DeviceType::PrivateUse1, OP, OVERLOAD, POLICY) +#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_XPU( \ + REDISPATCH_FUNC, \ + REGISTER_NAME, \ + REGISTER_SIGNATURE, \ + REDISPATCH_SIGNATURE, \ + POLICY) \ + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE( \ + c10::DeviceType::XPU, \ + REDISPATCH_FUNC, \ + REGISTER_NAME, \ + REGISTER_SIGNATURE, \ + REDISPATCH_SIGNATURE, \ + POLICY) + +// KERNEL_PRIVATEUSEONE/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE +// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastPrivateUse1 +#define KERNEL_PRIVATEUSEONE(...) \ + KERNEL(c10::DeviceType::PrivateUse1, __VA_ARGS__) #define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE( \ REDISPATCH_FUNC, \ @@ -645,3 +744,158 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. REGISTER_SIGNATURE, \ REDISPATCH_SIGNATURE, \ POLICY) + +// Op lists for different policies. +// To make sure other backends can reuse the policy op list. 
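For context, these shared lists let any autocast backend stamp out its registrations instead of enumerating ops by hand, exactly as the AutocastXPU block above does. A hypothetical sketch for a PrivateUse1 backend follows; the helper macro names and the choice of policies are illustrative, not part of this change:

TORCH_LIBRARY_IMPL(_, AutocastPrivateUse1, m) {
  m.fallback(torch::CppFunction::makeFallthrough());
}

TORCH_LIBRARY_IMPL(aten, AutocastPrivateUse1, m) {
  // KERNEL_PRIVATEUSEONE now accepts (OP, POLICY) or (OP, OVERLOAD, POLICY).
#define _KERNEL_PU1_LOW_PRECISION_FP(...) \
  KERNEL_PRIVATEUSEONE(__VA_ARGS__, lower_precision_fp)
  AT_FORALL_LOWER_PRECISION_FP(_KERNEL_PU1_LOW_PRECISION_FP)

#define _KERNEL_PU1_FP32(...) KERNEL_PRIVATEUSEONE(__VA_ARGS__, fp32)
  AT_FORALL_FP32(_KERNEL_PU1_FP32)
}

At runtime such a backend is then driven through the device-generic calls introduced above, e.g. at::autocast::set_autocast_enabled(at::kPrivateUse1, true), set_autocast_dtype(at::kPrivateUse1, at::kHalf), and get_autocast_dtype(at::kPrivateUse1).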
+#define AT_FORALL_LOWER_PRECISION_FP(_) \ + _(_convolution, deprecated) \ + _(_convolution) \ + _(conv1d) \ + _(conv2d) \ + _(conv3d) \ + _(conv_tbc) \ + _(conv_transpose1d) \ + _(conv_transpose2d, input) \ + _(conv_transpose3d, input) \ + _(convolution) \ + _(prelu) \ + _(addmm) \ + _(addmv) \ + _(addr) \ + _(matmul) \ + _(einsum) \ + _(mm) \ + _(mv) \ + _(linalg_vecdot) \ + _(linear) \ + _(addbmm) \ + _(baddbmm) \ + _(bmm) \ + _(chain_matmul) \ + _(linalg_multi_dot) \ + _(_thnn_fused_lstm_cell) \ + _(_thnn_fused_gru_cell) \ + _(lstm_cell) \ + _(gru_cell) \ + _(rnn_tanh_cell) \ + _(rnn_relu_cell) \ + _(_scaled_dot_product_flash_attention) \ + _(scaled_dot_product_attention) + +#define AT_FORALL_FP32(_) \ + _(acos) \ + _(asin) \ + _(cosh) \ + _(erfinv) \ + _(exp) \ + _(expm1) \ + _(log) \ + _(log10) \ + _(log2) \ + _(log1p) \ + _(reciprocal) \ + _(rsqrt) \ + _(sinh) \ + _(tan) \ + _(pow, Tensor_Scalar) \ + _(pow, Tensor_Tensor) \ + _(pow, Scalar) \ + _(softplus) \ + _(layer_norm) \ + _(native_layer_norm) \ + _(group_norm) \ + _(frobenius_norm, dim) \ + _(nuclear_norm) \ + _(nuclear_norm, dim) \ + _(cosine_similarity) \ + _(poisson_nll_loss) \ + _(cosine_embedding_loss) \ + _(nll_loss) \ + _(nll_loss2d) \ + _(hinge_embedding_loss) \ + _(kl_div) \ + _(l1_loss) \ + _(smooth_l1_loss) \ + _(huber_loss) \ + _(mse_loss) \ + _(margin_ranking_loss) \ + _(multilabel_margin_loss) \ + _(soft_margin_loss) \ + _(triplet_margin_loss) \ + _(multi_margin_loss) \ + _(binary_cross_entropy_with_logits) \ + _(dist) \ + _(pdist) \ + _(cdist) \ + _(renorm) \ + _(logsumexp) \ + _(upsample_nearest1d) \ + _(_upsample_nearest_exact1d) \ + _(upsample_nearest2d) \ + _(_upsample_nearest_exact2d) \ + _(upsample_nearest3d) \ + _(_upsample_nearest_exact3d) \ + _(upsample_linear1d) \ + _(upsample_bilinear2d) \ + _(_upsample_bilinear2d_aa) \ + _(upsample_trilinear3d) \ + _(upsample_bicubic2d) \ + _(_upsample_bicubic2d_aa) + +#define AT_FORALL_FP32_SET_OPT_DTYPE(_) \ + _(prod) \ + _(prod, dim_int) \ + _(prod, dim_Dimname) \ + _(softmax, int) \ + _(softmax, Dimname) \ + _(log_softmax, int) \ + _(log_softmax, Dimname) \ + _(cumprod) \ + _(cumprod, dimname) \ + _(cumsum) \ + _(cumsum, dimname) \ + _(linalg_vector_norm) \ + _(linalg_matrix_norm) \ + _(linalg_matrix_norm, str_ord) \ + _(sum) \ + _(sum, dim_IntList) \ + _(sum, dim_DimnameList) + +#define AT_FORALL_DIFFERENT_REDISPATCH_SIGNATURE(_) \ + _(ADD_NS(norm), \ + "norm.Scalar", \ + Tensor(const Tensor&, const Scalar&), \ + Tensor(const Tensor&, const c10::optional&, ScalarType), \ + fp32_append_dtype) \ + _(ADD_NS(norm), \ + "norm.ScalarOpt_dim", \ + Tensor(const Tensor&, const c10::optional&, IntArrayRef, bool), \ + Tensor( \ + const Tensor&, \ + const c10::optional&, \ + IntArrayRef, \ + bool, \ + ScalarType), \ + fp32_append_dtype) \ + _(ADD_NS(norm), \ + "norm.names_ScalarOpt_dim", \ + Tensor(const Tensor&, const c10::optional&, DimnameList, bool), \ + Tensor( \ + const Tensor&, \ + const c10::optional&, \ + DimnameList, \ + bool, \ + ScalarType), \ + fp32_append_dtype) + +#define AT_FORALL_PROMOTE(_) \ + _(addcdiv) \ + _(addcmul) \ + _(atan2) \ + _(bilinear) \ + _(cross) \ + _(dot) \ + _(grid_sampler) \ + _(index_put) \ + _(tensordot) \ + _(scatter_add) diff --git a/aten/src/ATen/ceil_div.h b/aten/src/ATen/ceil_div.h index 2c13ff8115a09..37d67b232a22c 100644 --- a/aten/src/ATen/ceil_div.h +++ b/aten/src/ATen/ceil_div.h @@ -7,7 +7,7 @@ namespace at { /** Computes ceil(a / b) */ -template ::value>> +template >> C10_ALWAYS_INLINE C10_HOST_DEVICE T ceil_div(T a, T b) { 
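  // For non-negative a and positive b this matches std::ceil(double(a) / b),
  // e.g. ceil_div(10, 4) = (10 + 4 - 1) / 4 = 13 / 4 = 3.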
return (a + b - 1) / b; } diff --git a/aten/src/ATen/code_template.h b/aten/src/ATen/code_template.h index 14ac2fa171561..393e322e6fe66 100644 --- a/aten/src/ATen/code_template.h +++ b/aten/src/ATen/code_template.h @@ -7,8 +7,7 @@ #include #include -namespace at { -namespace jit { +namespace at::jit { // A template environment is a mapping from template variable names, e.g., // identifier (corresponding to $identifier) to their expansions. @@ -241,5 +240,4 @@ static inline std::string format(const std::string& fmt, TemplateEnv& env) { return CodeTemplate(fmt).format(env); } -} // namespace jit -} // namespace at +} // namespace at::jit diff --git a/aten/src/ATen/core/ATen_pch.h b/aten/src/ATen/core/ATen_pch.h index 1f36d0ab9f87b..57ca22bf4377a 100644 --- a/aten/src/ATen/core/ATen_pch.h +++ b/aten/src/ATen/core/ATen_pch.h @@ -110,6 +110,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/aten/src/ATen/core/Array.h b/aten/src/ATen/core/Array.h index 300ae51cef6b9..8372fe81c5c5a 100644 --- a/aten/src/ATen/core/Array.h +++ b/aten/src/ATen/core/Array.h @@ -6,10 +6,11 @@ #include #include -namespace at { namespace detail { +namespace at::detail { template struct Array { + // NOLINTNEXTLINE(*c-array*) T data[size_]; C10_HOST_DEVICE T operator[](int i) const { @@ -27,7 +28,9 @@ struct Array { Array(const Array&) = default; Array& operator=(const Array&) = default; #endif - static constexpr int size(){return size_;} + static constexpr int size() { + return size_; + } // Fill the array with x. C10_HOST_DEVICE Array(T x) { for (int i = 0; i < size_; i++) { @@ -36,4 +39,4 @@ struct Array { } }; -}} +} // namespace at::detail diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h new file mode 100644 index 0000000000000..d04cb1c6b8a70 --- /dev/null +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -0,0 +1,380 @@ +#include +#include +#include +#include + +#include +#include +#include + +namespace at { + +/** + * HostBlock is typically a fundamental memory block used in pinned memory. It + * is likely related to Event and Stream of device runtime. It is probably a + * base struct or interface that can be inherited and extended by each backend. + */ +template +struct HostBlock { + // constructor for search key + HostBlock(size_t size) : size_(size) {} + + HostBlock(size_t size, void* ptr) : size_(size), ptr_(ptr) {} + + std::mutex mutex_; + size_t size_{0}; // block size in bytes + void* ptr_{nullptr}; // memory address + bool allocated_{false}; // in-use flag + size_t event_count_{0}; // number of related events + ska::flat_hash_set streams_; // streams on which the block was used +}; + +/** + * ComparatorSize is used for lookup support in the set of host memory blocks + * using the block size. + */ +template +struct ComparatorSize { + bool operator()(const B* a, const B* b) const { + if (a->size_ != b->size_) { + return a->size_ < b->size_; + } + return (uintptr_t)a->ptr_ < (uintptr_t)b->ptr_; + } +}; + +/** + * Note [HostAllocator design] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * We have three key data structures - the free list which stores blocks that + * are not currently used, the block list which stores all blocks that have been + * allocated, and the event queue which stores runtime events and their + * corresponding blocks. + * + * Each of these are protected by a separate mutex. 
The key design principles + * are to 1) only hold each mutex for the minimal amount of time possible, 2) + * never do any possible expensive operations (such as CUDA runtime API calls) + * while holding the lock. + * + * There are four public methods: allocate, free, record_event and empty_cache. + * 1) In the allocate path, we first check to see if we can service our + * request from this free list, and otherwise we create a new block with + * allocate_host_memory. + * 2) In the free path, we insert events (if required) into the event queue, + * and if possible insert our block back into the free list. In allocate, we + * first eagerly query events until we find one that is not ready, and insert + * the corresponding block onto the free list if all the events recorded for a + * block are ready. + * 3) In the record_event path, we simply insert the given stream into the set + * of streams tracked by the specified block. This set of streams is then + * consumed in the free path. + * 4) In the empty_cache path, we flush any available blocks into the free + * list. Remove all element of free list, then remove them from block list and + * release the associated pinned memory allocation via free_block. + * + * We generalize the caching host allocator into two parts: interface and + * implementation. For any new backend looking to integrate with host allocator + * and reuse caching mechanism, these two parts are necessary to be specialized. + * + * For the implementation, we provide a CachingHostAllocatorImpl struct + * to abstract the caching mechanism. Any backend needs to provide a customized + * implementation by specializing its own public functions and the related + * runtime functions. Its template parameter S represents runtime Stream, E + * denotes runtime Event, B indicates the fundamental memory block, and C + * signifies the sorting compartor algorithm for the memory blocks. + * + * For the interface, we provide a CachingHostAllocatorInterface struct as an + * interface. Any backend needs to derive its own host allocator from this + * interface. Its template parameter T refers to an implementation that + * inherited from CachingHostAllocatorImpl. + * + * So this design can share the caching mechanism across each backend, and + * provide flexibility to each backend. A backend can choose to follow this + * implementation or reuse them by extending and overriding them as necessary. + * Taking CUDA as an example, it specializes runtime related functions to reuse + * the caching mechanism. Additionally, it extends the allocator's functionality + * by adding the allocWithCudaHostRegister function to support page-locking the + * memory range used by CUDA. Of course, you can also refer to + * XPUCachingHostAllocator, which is a host caching allocator supported on XPU + * backend, to implement a basic host caching allocator. + * + * Some of the invariants here are less strict than they could be - for example, + * we do not enforce that free(Block* block) => block->event_count == 0. This is + * for compatibility reasons, and we can explore enforcing these in subsequent + * versions. + * + * Note that this caching host allocator does not split larger allocations into + * smaller blocks, unlike the caching device allocator. + */ + +template < + typename S, + typename E, + typename B = HostBlock, + typename C = ComparatorSize> +struct CachingHostAllocatorImpl { + virtual ~CachingHostAllocatorImpl() = default; + + public: + // return data_ptr and block pair. 
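  // Allocation flow: drain ready events via process_events(), then try to
  // reuse a block from the free list; on a miss, round the request up to the
  // next power of two, allocate fresh pinned memory via allocate_host_memory(),
  // and register the new block in the block list.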
+ virtual std::pair allocate(size_t size) { + if (size == 0) { + return {nullptr, nullptr}; + } + + process_events(); + + // First, try to allocate from the free list + auto* block = get_free_block(size); + if (block) { + return {block->ptr_, reinterpret_cast(block)}; + } + + // Round up the allocation to the nearest power of two to improve reuse. + size_t roundSize = c10::llvm::PowerOf2Ceil(size); + void* ptr = nullptr; + allocate_host_memory(roundSize, &ptr); + + // Then, create a new block. + block = new B(roundSize, ptr); + block->allocated_ = true; + + add_allocated_block(block); + return {block->ptr_, reinterpret_cast(block)}; + } + + virtual void free(void* ctx) { + if (!ctx) { + return; + } + + // Note: we can assume that free is correctly paired with alloc, and thus we + // do not need to look up the ctx in blocks_. + auto* block = reinterpret_cast(ctx); + + c10::optional> events; + { + std::lock_guard g(block->mutex_); + block->allocated_ = false; + if (block->streams_.empty()) { + TORCH_INTERNAL_ASSERT(block->event_count_ == 0); + } else { + events = std::vector(); + events->reserve(block->streams_.size()); + for (auto stream : block->streams_) { + record_stream(events, stream); + } + block->event_count_ += events->size(); + block->streams_.clear(); + } + } + + if (!events) { + std::lock_guard g(free_list_mutex_); + free_list_.insert(block); + } else { + // restore these events that record by used streams. + std::lock_guard g(events_mutex_); + for (auto&& event : *events) { + events_.emplace_front(std::move(event), block); + } + } + } + + virtual bool record_event(void* ptr, void* ctx, S stream) { + auto* block = reinterpret_cast(ctx); + + // Note: we need to check if the passed-in `ctx` is valid. This is because + // `record_event` (via `CachingHostAllocator_recordEvent`) can be invoked on + // an arbitrary tensor, and is not guaranteed to correspond to a pinned + // memory allocation. Therefore, we need to check that `ctx` is valid before + // proceeding. + { + std::lock_guard g(blocks_mutex_); + if (blocks_.find(block) != blocks_.end()) { + // Now we know this object is safe to access. + std::lock_guard gb(block->mutex_); + TORCH_INTERNAL_ASSERT(block->allocated_); + block->streams_.insert(stream); + return true; + } + auto it = ptr_to_block_.find(ptr); + if (it != ptr_to_block_.end()) { + block = it->second; + std::lock_guard g(block->mutex_); + TORCH_INTERNAL_ASSERT(block->allocated_); + block->streams_.insert(stream); + return true; + } + } + + return false; + } + + virtual void empty_cache() { + // Flush any available blocks into the free_list. + process_events(); + + // Remove all elements from the free list, remove them from the blocks + // list, and free the associated pinned memory allocation. This requires + // concurrently holding both the free list mutex and the blocks mutex, and + // is the only function that concurrently holds multiple mutexes. 
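    // std::lock acquires both mutexes together with deadlock avoidance; the
    // adopt_lock guards below only take ownership so both are released on
    // scope exit.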
+ std::lock(free_list_mutex_, blocks_mutex_); + std::lock_guard gf(free_list_mutex_, std::adopt_lock); + std::lock_guard gb(blocks_mutex_, std::adopt_lock); + + std::vector blocks_to_remove(free_list_.begin(), free_list_.end()); + free_list_.clear(); + for (auto* block : blocks_to_remove) { + blocks_.erase(block); + ptr_to_block_.erase(block->ptr_); + free_block(block); + delete block; + } + } + + virtual void copy_data(void* dest, const void* src, std::size_t count) const { + TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for copy_data"); + } + + private: + virtual void add_allocated_block(B* block) { + std::lock_guard g(blocks_mutex_); + blocks_.insert(block); + ptr_to_block_.insert({block->ptr_, block}); + } + + virtual B* get_free_block(size_t size) { + std::lock_guard g(free_list_mutex_); + B key(size); + auto it = free_list_.lower_bound(&key); + if (it != free_list_.end()) { + B* block = *it; + block->allocated_ = true; + free_list_.erase(it); + return block; + } + return nullptr; + } + + virtual void process_events() { + + while (true) { + // Avoid calling cudaEventDestroy while holding a mutex, so move + // intermediate events out of the lock into this object. + // process the last event + c10::optional> processed; + { + std::lock_guard g(events_mutex_); + if (!events_.empty()) { + processed = std::move(events_.back()); + events_.pop_back(); + } + } + + if (!processed) { + return; + } + + // otherwise, query the event + { + // now, see if we can handle this element + auto& event = processed->first; + if (!query_event(event)) { + // push the event onto the back if it's not ready. + { + std::lock_guard g(events_mutex_); + events_.push_back(std::move(*processed)); + } + return; + } + } + + // Process the events. + TORCH_INTERNAL_ASSERT(processed); + auto* block = processed->second; + bool available = false; + { + std::lock_guard g(block->mutex_); + TORCH_INTERNAL_ASSERT(!block->allocated_) + block->event_count_--; + if (block->event_count_ == 0) { + available = true; + } + } + + if (available) { + std::lock_guard g(free_list_mutex_); + free_list_.insert(block); + } + } + } + + /* These following functions are runtime-related. */ + + // Allocate page-locked memory on the host. + virtual void allocate_host_memory(size_t size, void** ptr) { + TORCH_CHECK_NOT_IMPLEMENTED( + false, "Not implemented for allocate_host_memory"); + } + + // Free block and release the pointer contained in block. + virtual void free_block(B* block) { + TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for free_block"); + } + + // Record an event on stream and store event into events. + virtual void record_stream(c10::optional>& events, S stream) { + TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for record_stream"); + } + + // Query event if it is completed. + virtual bool query_event(E& event) { + TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event"); + } + + alignas(64) std::mutex blocks_mutex_; + ska::flat_hash_set blocks_; // block list + ska::flat_hash_map ptr_to_block_; + + // Note: sharding this mutex seems to be profitable in heavily multi-threaded + // scenarios. + alignas(64) std::mutex free_list_mutex_; + // Note: an alternative datastructure can yield significant wins here in + // microbenchmarks. 
+ std::set free_list_; // free list + + alignas(64) std::mutex events_mutex_; + std::deque> events_; // event queue paired with block +}; + +template +struct CachingHostAllocatorInterface : public at::Allocator { + CachingHostAllocatorInterface() :impl_(std::make_unique()) {} + + at::DataPtr allocate(size_t size) override { + TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for allocate"); + } + + void free(void* ctx) { + impl_->free(ctx); + } + + template + bool record_event(void* ptr, void* ctx, S stream) { + return impl_->record_event(ptr, ctx, stream); + } + + void empty_cache() { + impl_->empty_cache(); + } + + void copy_data(void* dest, const void* src, std::size_t count) + const override { + impl_->copy_data(dest, src, count); + } + + std::unique_ptr impl_; +}; + +} // namespace at diff --git a/aten/src/ATen/core/CheckMemoryFormat.h b/aten/src/ATen/core/CheckMemoryFormat.h index 3d1712a2ff19b..442889e2eec6f 100644 --- a/aten/src/ATen/core/CheckMemoryFormat.h +++ b/aten/src/ATen/core/CheckMemoryFormat.h @@ -1,6 +1,6 @@ #include -namespace c10 { namespace impl { +namespace c10::impl { inline c10::optional check_tensor_options_and_extract_memory_format( @@ -22,4 +22,4 @@ check_tensor_options_and_extract_memory_format( } } -}} // namespace impl namespace c10 +} // namespace impl namespace c10 diff --git a/aten/src/ATen/core/DeprecatedTypeProperties.h b/aten/src/ATen/core/DeprecatedTypeProperties.h index b77b09d595d3c..222465eac56f2 100644 --- a/aten/src/ATen/core/DeprecatedTypeProperties.h +++ b/aten/src/ATen/core/DeprecatedTypeProperties.h @@ -94,6 +94,10 @@ class TORCH_API DeprecatedTypeProperties { return toBackend(Backend::HIP); } + DeprecatedTypeProperties & privateUser1() const { + return toBackend(Backend::PrivateUse1); + } + /// Constructs the `TensorOptions` from a type and a `device_index`. TensorOptions options(int16_t device_index = -1) const { return TensorOptions().dtype(typeMeta()) diff --git a/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.h b/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.h index a21f1abbe97f4..78f0cfdfa5530 100644 --- a/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.h +++ b/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.h @@ -5,6 +5,7 @@ #include #include +#include namespace at { @@ -21,6 +22,7 @@ class TORCH_API DeprecatedTypePropertiesRegistry { DeprecatedTypeProperties& getDeprecatedTypeProperties(Backend p, ScalarType s) const; private: + // NOLINTNEXTLINE(*c-array*) std::unique_ptr registry [static_cast(Backend::NumOptions)] [static_cast(ScalarType::NumOptions)]; diff --git a/aten/src/ATen/core/Dict.cpp b/aten/src/ATen/core/Dict.cpp index 3721ad25c9b92..fb49f75d63cd7 100644 --- a/aten/src/ATen/core/Dict.cpp +++ b/aten/src/ATen/core/Dict.cpp @@ -1,7 +1,7 @@ #include -namespace c10 { -namespace detail { + +namespace c10::detail { bool operator==(const DictImpl& lhs, const DictImpl& rhs) { bool isEqualFastChecks = *lhs.elementTypes.keyType == *rhs.elementTypes.keyType && @@ -25,5 +25,4 @@ bool operator==(const DictImpl& lhs, const DictImpl& rhs) { return true; } -} // namespace detail -} // namespace c10 +} // namespace c10::detail diff --git a/aten/src/ATen/core/Dict.h b/aten/src/ATen/core/Dict.h index c4fb44ce0c636..964b4a152b5ae 100644 --- a/aten/src/ATen/core/Dict.h +++ b/aten/src/ATen/core/Dict.h @@ -207,7 +207,7 @@ template Dict toGenericDict(Dict class Dict final { private: - static_assert((std::is_same::value && std::is_same::value) || guts::typelist::contains::value, "Invalid Key type for Dict. 
We only support int64_t, double, bool, and string."); + static_assert((std::is_same_v && std::is_same_v) || guts::typelist::contains::value, "Invalid Key type for Dict. We only support int64_t, double, bool, and string."); // impl_ stores the underlying map as a ska_ordered::order_preserving_flat_hash_map. // We intentionally don't offer conversion from/to diff --git a/aten/src/ATen/core/Dict_inl.h b/aten/src/ATen/core/Dict_inl.h index 69f6791d91cac..0419b3bd49e91 100644 --- a/aten/src/ATen/core/Dict_inl.h +++ b/aten/src/ATen/core/Dict_inl.h @@ -120,9 +120,9 @@ template std::pair::iterator, bool> Dict::insert(Key_&& key, Value_&& value) const { static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert"); static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert"); - auto inserted = impl_->dict.insert(std::pair{ - Key(std::forward(key)), - Value(std::forward(value))}); + auto inserted = impl_->dict.emplace( + Key(std::forward(key)), + Value(std::forward(value))); return {iterator{inserted.first}, inserted.second}; } diff --git a/aten/src/ATen/core/Dimname.cpp b/aten/src/ATen/core/Dimname.cpp index b39b7f00b5c66..47526b6511edd 100644 --- a/aten/src/ATen/core/Dimname.cpp +++ b/aten/src/ATen/core/Dimname.cpp @@ -20,7 +20,7 @@ bool Dimname::isValidName(const std::string& name) { // letters A through Z, the underscore _ and, except for the first // character, the digits 0 through 9" (at least length 1) // https://docs.python.org/3/reference/lexical_analysis.html#identifiers - if (name.length() == 0) { + if (name.empty()) { return false; } for (auto it = name.begin(); it != name.end(); ++it) { diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp index 957b89c7a1f16..824640705238a 100644 --- a/aten/src/ATen/core/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -72,7 +72,7 @@ static std::tuple __printFormat(std::ostream& stream, const Tensor& return std::make_tuple(1., 0); } bool intMode = true; - auto self_p = self.data_ptr(); + auto self_p = self.const_data_ptr(); for (const auto i : c10::irange(size)) { auto z = self_p[i]; if(std::isfinite(z)) { @@ -160,7 +160,7 @@ static void __printIndent(std::ostream &stream, int64_t indent) static void printScale(std::ostream & stream, double scale) { FormatGuard guard(stream); - stream << defaultfloat << scale << " *" << std::endl; + stream << defaultfloat << scale << " *" << '\n'; } static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t linesize, int64_t indent) { @@ -178,7 +178,7 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line } if(nColumnPerLine < self.size(1)) { if(firstColumn != 0) { - stream << std::endl; + stream << '\n'; } stream << "Columns " << firstColumn+1 << " to " << lastColumn+1; __printIndent(stream, indent); @@ -189,11 +189,11 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line } for (const auto l : c10::irange(self.size(0))) { Tensor row = self.select(0,l); - double *row_ptr = row.data_ptr(); + const double *row_ptr = row.const_data_ptr(); for (const auto c : c10::irange(firstColumn, lastColumn+1)) { stream << std::setw(sz) << row_ptr[c]/scale; if(c == lastColumn) { - stream << std::endl; + stream << '\n'; if(l != self.size(0)-1) { if(scale != 1) { __printIndent(stream, indent); @@ -239,7 +239,7 @@ static void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) if(start) { start = false; } else { - stream << std::endl; + stream 
<< '\n'; } stream << "("; Tensor tensor = self; @@ -247,7 +247,7 @@ static void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) tensor = tensor.select(0, counter[i]); stream << counter[i]+1 << ","; } - stream << ".,.) = " << std::endl; + stream << ".,.) = " << '\n'; __printMatrix(stream, tensor, linesize, 1); } } @@ -279,7 +279,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi tensor = tensor_.to(kCPU, kDouble).contiguous(); } if(tensor.ndimension() == 0) { - stream << defaultfloat << tensor.data_ptr()[0] << std::endl; + stream << defaultfloat << tensor.const_data_ptr()[0] << '\n'; stream << "[ " << tensor_.toString() << "{}"; } else if(tensor.ndimension() == 1) { if (tensor.numel() > 0) { @@ -287,9 +287,9 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi if(scale != 1) { printScale(stream, scale); } - double* tensor_p = tensor.data_ptr(); + const double* tensor_p = tensor.const_data_ptr(); for (const auto i : c10::irange(tensor.size(0))) { - stream << std::setw(sz) << tensor_p[i]/scale << std::endl; + stream << std::setw(sz) << tensor_p[i]/scale << '\n'; } } stream << "[ " << tensor_.toString() << "{" << tensor.size(0) << "}"; @@ -329,7 +329,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi if (tensor.getIntrusivePtr()->autograd_meta()) { auto& fw_grad = tensor._fw_grad(/* level */ 0); if (fw_grad.defined()) { - stream << ", tangent:" << std::endl << fw_grad; + stream << ", tangent:" << '\n' << fw_grad; } } stream << " ]"; diff --git a/aten/src/ATen/core/Generator.cpp b/aten/src/ATen/core/Generator.cpp index 800f8c7c88ec6..0334161f54e73 100644 --- a/aten/src/ATen/core/Generator.cpp +++ b/aten/src/ATen/core/Generator.cpp @@ -13,4 +13,12 @@ at::Tensor Generator::get_state() const { return at::Tensor::wrap_tensor_impl(this->impl_->get_state()); } +void Generator::graphsafe_set_state(const Generator& new_state) { + this->impl_->graphsafe_set_state(new_state.getIntrusivePtr()); +} + +Generator Generator::graphsafe_get_state() const { + return Generator(this->impl_->graphsafe_get_state()); +} + } // namespace at diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index 36f2eac9667fc..b237c571b22d3 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -1,15 +1,11 @@ #pragma once -#include +#include #include -#include -#include +#include #include -#include -#include #include -#include #include #include #include @@ -111,6 +107,10 @@ struct TORCH_API Generator { at::Tensor get_state() const; + void graphsafe_set_state(const Generator& new_state); + + Generator graphsafe_get_state() const; + std::mutex& mutex() { return impl_->mutex_; } diff --git a/aten/src/ATen/core/IListRef.h b/aten/src/ATen/core/IListRef.h index 2bbf43975eacc..01e52f52f684c 100644 --- a/aten/src/ATen/core/IListRef.h +++ b/aten/src/ATen/core/IListRef.h @@ -307,10 +307,10 @@ class IListRefTagImplBase {}; * reference type, then it's left unchanged. */ template -using _MaterializedIListRefElem = typename std::conditional< - std::is_reference::value, - typename std::reference_wrapper::type>, - T>::type; +using _MaterializedIListRefElem = std::conditional_t< + std::is_reference_v, + typename std::reference_wrapper>, + T>; template using MaterializedIListRefElem = _MaterializedIListRefElem>; @@ -540,7 +540,7 @@ class IListRef { template < typename... 
UnboxedConstructorArgs, typename = std::enable_if_t< - std::is_constructible::value>> + std::is_constructible_v>> IListRef(UnboxedConstructorArgs&&... args) : tag_(IListRefTag::Unboxed) { payload_.unboxed = unboxed_type(std::forward(args)...); } diff --git a/aten/src/ATen/core/IListRef_inl.h b/aten/src/ATen/core/IListRef_inl.h index 534272f69b64f..34673d6bf2b24 100644 --- a/aten/src/ATen/core/IListRef_inl.h +++ b/aten/src/ATen/core/IListRef_inl.h @@ -8,8 +8,8 @@ class Tensor; class OptionalTensorRef; } -namespace c10 { -namespace detail { + +namespace c10::detail { /* * Specializations of `IListRefTagImplBase` that implement the default @@ -184,8 +184,8 @@ class IListRefTagImpl at::OptionalTensorRef, MaterializedIListRefElem> {}; -} // namespace detail -} // namespace c10 +} // namespace c10::detail + namespace at { diff --git a/aten/src/ATen/core/IListRef_test.cpp b/aten/src/ATen/core/IListRef_test.cpp index 0530dea5f28b5..3fcb3858e657f 100644 --- a/aten/src/ATen/core/IListRef_test.cpp +++ b/aten/src/ATen/core/IListRef_test.cpp @@ -103,7 +103,7 @@ TEST(ITensorListRefTest, Boxed_GetConstRefTensor) { const List boxed(vec); at::ITensorListRef list(boxed); static_assert( - std::is_same::value, + std::is_same_v, "Accessing elements from List through a ITensorListRef should be const references."); EXPECT_TRUE(boxed[0].is_same(*list.begin())); EXPECT_TRUE(boxed[1].is_same(*(++list.begin()))); @@ -113,7 +113,7 @@ TEST(ITensorListRefTest, Unboxed_GetConstRefTensor) { auto vec = get_tensor_vector(); at::ITensorListRef list(vec); static_assert( - std::is_same::value, + std::is_same_v, "Accessing elements from ArrayRef through a ITensorListRef should be const references."); EXPECT_TRUE(vec[0].is_same(*list.begin())); EXPECT_TRUE(vec[1].is_same(*(++list.begin()))); diff --git a/aten/src/ATen/core/List.cpp b/aten/src/ATen/core/List.cpp index 6fb9b11ef156f..a9f041517062e 100644 --- a/aten/src/ATen/core/List.cpp +++ b/aten/src/ATen/core/List.cpp @@ -1,7 +1,7 @@ #include -namespace c10 { -namespace detail { + +namespace c10::detail { bool operator==(const ListImpl& lhs, const ListImpl& rhs) { return *lhs.elementType == *rhs.elementType && lhs.list.size() == rhs.list.size() && @@ -16,5 +16,4 @@ bool operator==(const ListImpl& lhs, const ListImpl& rhs) { ListImpl::ListImpl(list_type list_, TypePtr elementType_) : list(std::move(list_)) , elementType(std::move(elementType_)) {} -} // namespace detail -} // namespace c10 +} // namespace c10::detail diff --git a/aten/src/ATen/core/List.h b/aten/src/ATen/core/List.h index d1271dadec2ac..68ecf5ed343f8 100644 --- a/aten/src/ATen/core/List.h +++ b/aten/src/ATen/core/List.h @@ -44,7 +44,7 @@ template class ListIterator; template class ListElementReference; template -void swap(ListElementReference&& lhs, ListElementReference&& rhs); +void swap(ListElementReference&& lhs, ListElementReference&& rhs) noexcept; template bool operator==(const ListElementReference& lhs, const T& rhs); @@ -68,8 +68,8 @@ template class ListElementReference final { public: operator std::conditional_t< - std::is_reference::type>::value, + std::is_reference_v::type>, const T&, T>() const; @@ -84,7 +84,7 @@ class ListElementReference final { return *iterator_; } - friend void swap(ListElementReference&& lhs, ListElementReference&& rhs); + friend void swap(ListElementReference&& lhs, ListElementReference&& rhs) noexcept; ListElementReference(const ListElementReference&) = delete; ListElementReference& operator=(const ListElementReference&) = delete; @@ -285,7 +285,7 @@ class List final { * 
Returns the element at specified location pos, with bounds checking. * If pos is not within the range of the container, an exception of type std::out_of_range is thrown. */ - value_type get(size_type pos) const; + internal_const_reference_type get(size_type pos) const; /** * Moves out the element at the specified location pos and returns it, with bounds checking. diff --git a/aten/src/ATen/core/List_inl.h b/aten/src/ATen/core/List_inl.h index 0fb911278a919..f8ce73eb3f9cc 100644 --- a/aten/src/ATen/core/List_inl.h +++ b/aten/src/ATen/core/List_inl.h @@ -120,8 +120,8 @@ namespace impl { template ListElementReference::operator std::conditional_t< - std::is_reference::type>::value, + std::is_reference_v::type>, const T&, T>() const { return iterator_->template to(); @@ -146,7 +146,7 @@ ListElementReference& ListElementReference::operator=( } template -void swap(ListElementReference&& lhs, ListElementReference&& rhs) { +void swap(ListElementReference&& lhs, ListElementReference&& rhs) noexcept { std::swap(*lhs.iterator_, *rhs.iterator_); } @@ -186,8 +186,8 @@ void List::set(size_type pos, value_type&& value) const { } template -typename List::value_type List::get(size_type pos) const { - return c10::detail::list_element_to(impl_->list.at(pos)); +typename List::internal_const_reference_type List::get(size_type pos) const { + return operator[](pos); } template diff --git a/aten/src/ATen/core/List_test.cpp b/aten/src/ATen/core/List_test.cpp index 1fe14309a9330..56da3cf299e90 100644 --- a/aten/src/ATen/core/List_test.cpp +++ b/aten/src/ATen/core/List_test.cpp @@ -1118,7 +1118,7 @@ TEST(ListTestNonIValueBasedList, sameValueDifferentStorage_thenIsReturnsFalse) { TEST(ListTest, canAccessStringByReference) { List list({"one", "two"}); const auto& listRef = list; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "const List access should be by const reference"); std::string str = list[1]; const std::string& strRef = listRef[1]; @@ -1130,7 +1130,7 @@ TEST(ListTest, canAccessOptionalStringByReference) { List> list({"one", "two", c10::nullopt}); const auto& listRef = list; static_assert( - std::is_same>>::value, + std::is_same_v>>, "List> access should be by const reference"); c10::optional str1 = list[1]; c10::optional str2 = list[2]; @@ -1148,7 +1148,7 @@ TEST(ListTest, canAccessTensorByReference) { List list; const auto& listRef = list; static_assert( - std::is_same::value, + std::is_same_v, "List access should be by const reference"); } diff --git a/aten/src/ATen/core/MetaFallbackKernel.cpp b/aten/src/ATen/core/MetaFallbackKernel.cpp index fe56568bbbcd1..8523a55878103 100644 --- a/aten/src/ATen/core/MetaFallbackKernel.cpp +++ b/aten/src/ATen/core/MetaFallbackKernel.cpp @@ -8,14 +8,14 @@ static void metaFallback( const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { - c10::Dispatcher::singleton().throwIfHasAbstractImplPyStub(op.operator_name()); + c10::Dispatcher::singleton().throwIfHasPythonModule(op.operator_name()); TORCH_CHECK_NOT_IMPLEMENTED( false, op.operator_name(), ": attempted to run this operator with Meta tensors, but there was no ", - "abstract impl or Meta kernel registered. You may have run into this message " + "fake impl or Meta kernel registered. You may have run into this message " "while using an operator with PT2 compilation APIs (torch.compile/torch.export); " - "in order to use this operator with those APIs you'll need to add an abstract impl." 
+ "in order to use this operator with those APIs you'll need to add a fake impl." "Please see the following doc for next steps: " "https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit"); } diff --git a/aten/src/ATen/core/NamedTensor.cpp b/aten/src/ATen/core/NamedTensor.cpp index 846178f5a00d1..b224a7c9608cc 100644 --- a/aten/src/ATen/core/NamedTensor.cpp +++ b/aten/src/ATen/core/NamedTensor.cpp @@ -87,8 +87,8 @@ void check_names_valid_for(TensorImpl* impl, DimnameList names) { void internal_set_names_inplace(TensorImpl* impl, optional names, bool validate_names) { TORCH_CHECK(impl->layout() == Layout::Strided, "NYI: named tensors only support strided layout"); - TORCH_CHECK(impl->device().is_cpu() || impl->device().is_cuda() || impl->device().is_privateuseone(), - "NYI: named tensors only support CPU, CUDA or ", c10::get_privateuse1_backend(), " tensors."); + TORCH_CHECK(impl->device().is_cpu() || impl->device().is_cuda() || impl->device().is_xpu() || impl->device().is_privateuseone(), + "NYI: named tensors only support CPU, CUDA, XPU or ", c10::get_privateuse1_backend(), " tensors."); if (!names) { impl->set_named_tensor_meta(nullptr); return; @@ -121,9 +121,9 @@ void internal_set_names_inplace(TensorImpl* impl, std::vector&& names, } auto* meta = get_named_tensor_meta(impl); if (meta == nullptr) { - impl->set_named_tensor_meta(std::make_unique(NamedTensorMeta::HasNonWildcard, names)); + impl->set_named_tensor_meta(std::make_unique(NamedTensorMeta::HasNonWildcard, std::move(names))); } else { - meta->set_names(NamedTensorMeta::HasNonWildcard, names); + meta->set_names(NamedTensorMeta::HasNonWildcard, std::move(names)); } } diff --git a/aten/src/ATen/core/NamedTensor.h b/aten/src/ATen/core/NamedTensor.h index 73a0d7d02551b..d6ff30ce00838 100644 --- a/aten/src/ATen/core/NamedTensor.h +++ b/aten/src/ATen/core/NamedTensor.h @@ -2,7 +2,6 @@ #include #include -#include namespace at { @@ -45,7 +44,7 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface { // Used for an assertion in TensorImpl.h int64_t slow_dim() const override { - return names_.size(); + return static_cast(names_.size()); } void check_invariants() const { @@ -80,7 +79,7 @@ struct TORCH_API NamesMode { // A RAII, thread local (!) guard that enables or disables names upon // construction, and sets it back to the original value upon destruction. 
struct TORCH_API NoNamesGuard { - NoNamesGuard() : prev_mode(NamesMode::is_enabled()), initialized(true) { + NoNamesGuard() : prev_mode(NamesMode::is_enabled()) { NamesMode::set_enabled(false); } ~NoNamesGuard() { @@ -94,7 +93,7 @@ struct TORCH_API NoNamesGuard { } private: bool prev_mode; - bool initialized; + bool initialized{true}; }; void check_names_valid_for(const TensorBase& tensor, DimnameList names); diff --git a/aten/src/ATen/core/NestedIntSymNodeImpl.cpp b/aten/src/ATen/core/NestedIntSymNodeImpl.cpp new file mode 100644 index 0000000000000..b703f76773b46 --- /dev/null +++ b/aten/src/ATen/core/NestedIntSymNodeImpl.cpp @@ -0,0 +1,80 @@ +#include +#include +#include + +namespace c10 { + +namespace { +bool _eq(const char* op, c10::SymNodeImpl* lhs, c10::SymNodeImpl* rhs) { + TORCH_INTERNAL_ASSERT(lhs->is_nested_int()); + c10::optional c = rhs->nested_int(); + return ( + c.has_value() && lhs->nested_int() == *c && + lhs->nested_int_coeff() == rhs->nested_int_coeff()); +} +bool _ge(const char* op, c10::SymNodeImpl* lhs, c10::SymNodeImpl* rhs) { + if (auto mb_si = lhs->nested_int()) { + if (auto mb_si2 = rhs->nested_int()) { + if (*mb_si == *mb_si2) { + return lhs->nested_int_coeff() >= rhs->nested_int_coeff(); + } + TORCH_CHECK(false, "nested int ", op, ": Relation is indeterminate"); + } + // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + if (rhs->constant_int() && *rhs->constant_int() <= 2) { + return true; + } + TORCH_CHECK(false, "nested int ", op, ": Relation is indeterminate"); + } else if (rhs->nested_int()) { + // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + if (lhs->constant_int() && *lhs->constant_int() < 2) { + return false; + } + TORCH_CHECK(false, "nested int ", op, ": Relation is indeterminate"); + } + TORCH_INTERNAL_ASSERT(false, "expect at least one nested int"); +} +} // namespace + +c10::SymNode NestedIntSymNodeImpl::eq(const c10::SymNode& other) { + return SymNode(c10::make_intrusive>( + _eq("eq", this, other.get()))); +} + +c10::SymNode NestedIntSymNodeImpl::ne(const c10::SymNode& other) { + return SymNode(c10::make_intrusive>( + !_eq("ne", this, other.get()))); +} + +c10::SymNode NestedIntSymNodeImpl::ge(const c10::SymNode& other) { + return SymNode(c10::make_intrusive>( + _ge("ge", this, other.get()))); +} + +c10::SymNode NestedIntSymNodeImpl::gt(const c10::SymNode& other) { + return SymNode(c10::make_intrusive>( + !_ge("gt", other.get(), this))); +} + +c10::SymNode NestedIntSymNodeImpl::lt(const c10::SymNode& other) { + return SymNode(c10::make_intrusive>( + !_ge("lt", this, other.get()))); +} + +c10::SymNode NestedIntSymNodeImpl::le(const c10::SymNode& other) { + return SymNode(c10::make_intrusive>( + _ge("le", other.get(), this))); +} + +c10::SymNode NestedIntSymNodeImpl::mul(const c10::SymNode& other) { + TORCH_CHECK(!other->nested_int(), "nested int cannot be multiplied by nested int"); + c10::optional c = other->constant_int(); + TORCH_CHECK(c.has_value()); + return SymNode(c10::make_intrusive(val_, coeff_ * *c)); +} + +c10::SymNode NestedIntSymNodeImpl::clone() { + return SymNode(c10::make_intrusive(val_, coeff_)); +} + +} // namespace c10 diff --git a/aten/src/ATen/core/NestedIntSymNodeImpl.h b/aten/src/ATen/core/NestedIntSymNodeImpl.h new file mode 100644 index 0000000000000..228f4310a38fc --- /dev/null +++ b/aten/src/ATen/core/NestedIntSymNodeImpl.h @@ -0,0 +1,187 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +// The motivating usecase for this is to represent the 
ragged size structure +// of a jagged tensor [B, [s_0, s_1, s_2], D] as a single integer j0. This +// allows us to simply return [B, j0, D] if someone queries for the size of our +// tensor. +// +// Morally we define comparison between two nested ints to return true if +// that comparison holds for all corresponding elements of the arrays they +// represent. Comparison between a nested int and a plain int is defined +// similarly. +// +// To simulate this desired behavior but also avoid the O(N) cost of checking, +// we associate each raggedness pattern with an integer "id" that can be used as +// a proxy to evaluate equality. We also constrain the range of values for this +// as to enable inequality checks. +// +// We also support a positive integer scalar "coeff" that is used for computing +// strides. For example given, a [B, j0, D] tensor, it can be strided in two +// different ways: [D * j0, D, 1] and [j0, 1, sum(j0)]. The coeff is used to +// differentiate the two cases. +// +// During tracing the strides of the outputs need to be a function of the size +// and strides of the inputs so it is important that NestedIntSymNode itself is +// able to express this. +class TORCH_API NestedIntSymNodeImpl : public SymNodeImpl { + public: + // CAUTION: you should probably not be constructing these directly; please + // the higher-level API in python instead (TODO: actually introduce that). + explicit NestedIntSymNodeImpl(int64_t val, int64_t coeff) + : val_(val), coeff_(coeff) {} + + bool bool_() override { + return false; + } + + bool is_int() override { + return true; + } + + bool is_float() override { + return false; + } + + bool is_bool() override { + return false; + } + + bool is_nested_int() const override { + return true; + } + + bool has_hint() override { + return true; + } + + c10::SymNode wrap_int(int64_t num) override { + return SymNode(c10::make_intrusive>(num)); + }; + + int64_t guard_int(const char* file, int64_t line) override { + TORCH_CHECK(false); + } + + double guard_float(const char* file, int64_t line) override { + TORCH_CHECK(false, "not a float"); + } + + bool guard_bool(const char* file, int64_t line) override { + TORCH_CHECK(false, "not a bool"); + } + + int64_t int_() override { + TORCH_CHECK(false); + } + + std::string str() override { + if (coeff_ == 1) { + return "j" + std::to_string(val_); + } + return std::to_string(coeff_) + "*j" + std::to_string(val_); + } + + // NOTE [ Inequalities with nested int ] + // + // The semantics of nested int when it comes to relations is that it is + // treated as integer known to be within a certain range, + // + // j0 \in [2, int64_t::max] + // + // allowing us to answer queries like j0 >= 1 (True), and j0 == 0 (False). + // This is a useful default range for the raggedness pattern of a jagged + // tensor (1) since sizes are non-negative, and (2) we need to get past 0/1 + // specialization checks. + // + // [ Indeterminate inequalities error out ] + // + // Given the semantic defined above, certain relations like j0 < 3 are thus + // indeterminable. In our impl today, evaluating such relations error + // + // It may seem convenient to just define indeterminate relations to return + // False, but the implementation we maintain in parallel using sympy does not + // allow this. + // + // Sympy only allows overriding of Ge. The other relations (Lt, Gt, Le) are, + // by consequence, all derived from Ge e.g., Lt(a, b) := !Ge(a, b). 
This + // would mean that means that if we define the indeterminate j0 >= 3 to be + // False, the also indeterminate j0 < 3 will be evaluated to be True! + // + // [ Coefficient are assumed positive ] + // + // For the purpose of computing inequalities, we consider the coefficient of + // the nested int to be a positive integer. + // + // Thus, no modifications are needed to the logic since + // j0 >= k implies coeff * j0 >= k + // + c10::SymNode eq(const c10::SymNode& other) override; + c10::SymNode ne(const c10::SymNode& other) override; + c10::SymNode ge(const c10::SymNode& other) override; + c10::SymNode gt(const c10::SymNode& other) override; + c10::SymNode lt(const c10::SymNode& other) override; + c10::SymNode le(const c10::SymNode& other) override; + c10::SymNode mul(const c10::SymNode& other) override; + + c10::optional nested_int() override { + return val_; + } + + c10::optional nested_int_coeff() override { + return coeff_; + } + + bool is_symbolic() override { + return false; + } + + c10::SymNode clone() override; + +#define DEFINE_BINARY_NOT_SUPPORTED(name) \ + c10::SymNode name(const c10::SymNode& other) override { \ + TORCH_CHECK(false, #name " not supported by NestedIntSymNode"); \ + } + + DEFINE_BINARY_NOT_SUPPORTED(add) + DEFINE_BINARY_NOT_SUPPORTED(sub) + DEFINE_BINARY_NOT_SUPPORTED(truediv) + DEFINE_BINARY_NOT_SUPPORTED(pow) + DEFINE_BINARY_NOT_SUPPORTED(floordiv) + DEFINE_BINARY_NOT_SUPPORTED(mod) + DEFINE_BINARY_NOT_SUPPORTED(sym_min) + DEFINE_BINARY_NOT_SUPPORTED(sym_max) + DEFINE_BINARY_NOT_SUPPORTED(sym_and) + DEFINE_BINARY_NOT_SUPPORTED(sym_or) + +#undef DEFINE_BINARY_NOT_SUPPORTED + +#define DEFINE_NOT_SUPPORTED(name) \ + c10::SymNode name() override { \ + TORCH_CHECK(false, #name " is not supported by NestedIntSymNode"); \ + } + + DEFINE_NOT_SUPPORTED(sym_not) + DEFINE_NOT_SUPPORTED(ceil) + DEFINE_NOT_SUPPORTED(floor) + DEFINE_NOT_SUPPORTED(neg) + DEFINE_NOT_SUPPORTED(sym_float) + +#undef DEFINE_NOT_SUPPORTED + + private: + int64_t val_; + int64_t coeff_; +}; + +} // namespace c10 diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index 4758942c529b9..a34341b4a9437 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -120,8 +120,8 @@ void preDispatchFallback(const c10::OperatorHandle& op, c10::DispatchKeySet disp } // anonymous namespace -namespace at { -namespace impl { + +namespace at::impl { RestorePythonTLSSnapshot::RestorePythonTLSSnapshot() : saved_(safe_get_tls_on_entry()), guard_(safe_get_tls_on_entry()) { tls_on_entry = c10::nullopt; @@ -148,8 +148,7 @@ MaybeSetTLSOnEntryGuard::~MaybeSetTLSOnEntryGuard() { } -} // namespace impl -} // namespace at +} // namespace at::impl TORCH_LIBRARY_IMPL(_, Python, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonFallback>()); diff --git a/aten/src/ATen/core/PythonFallbackKernel.h b/aten/src/ATen/core/PythonFallbackKernel.h index f38bdd2ada90a..67f24795eeb58 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.h +++ b/aten/src/ATen/core/PythonFallbackKernel.h @@ -1,8 +1,8 @@ #pragma once #include -namespace at { -namespace impl { + +namespace at::impl { struct TORCH_API RestorePythonTLSSnapshot { RestorePythonTLSSnapshot(); @@ -24,5 +24,4 @@ struct TORCH_API MaybeSetTLSOnEntryGuard { bool value_set_; }; -} // namespace impl -} // namespace at +} // namespace at::impl diff --git a/aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp b/aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp index 
69f3fcc644f0f..219d774de3a54 100644 --- a/aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp +++ b/aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp @@ -1,7 +1,6 @@ #include -namespace at { -namespace impl { +namespace at::impl { // The strategy is that all python interpreters attempt to register themselves // as the main interpreter, but only one wins. Only that interpreter is @@ -9,14 +8,15 @@ namespace impl { // logic on that interpreter, we do so hermetically, never setting pyobj field // on Tensor. -std::atomic PythonOpRegistrationTrampoline::interpreter_{nullptr}; +std::atomic + PythonOpRegistrationTrampoline::interpreter_{nullptr}; c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() { return PythonOpRegistrationTrampoline::interpreter_.load(); - } -bool PythonOpRegistrationTrampoline::registerInterpreter(c10::impl::PyInterpreter* interp) { +bool PythonOpRegistrationTrampoline::registerInterpreter( + c10::impl::PyInterpreter* interp) { c10::impl::PyInterpreter* expected = nullptr; interpreter_.compare_exchange_strong(expected, interp); if (expected != nullptr) { @@ -29,5 +29,4 @@ bool PythonOpRegistrationTrampoline::registerInterpreter(c10::impl::PyInterprete } } -} // namespace impl -} // namespace at +} // namespace at::impl diff --git a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h index b1a2b30685f30..bec323c7d25bf 100644 --- a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h +++ b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h @@ -4,8 +4,8 @@ // TODO: this can probably live in c10 -namespace at { -namespace impl { + +namespace at::impl { class TORCH_API PythonOpRegistrationTrampoline final { static std::atomic interpreter_; @@ -19,5 +19,4 @@ class TORCH_API PythonOpRegistrationTrampoline final { static c10::impl::PyInterpreter* getInterpreter(); }; -} // namespace impl -} // namespace at +} // namespace at::impl diff --git a/aten/src/ATen/core/QuantizerBase.h b/aten/src/ATen/core/QuantizerBase.h index b6031f0d77983..0d2eaeece8898 100644 --- a/aten/src/ATen/core/QuantizerBase.h +++ b/aten/src/ATen/core/QuantizerBase.h @@ -37,6 +37,7 @@ using QuantizerPtr = c10::intrusive_ptr; * share the same Quantizer. Quantizer should be immutable. */ struct TORCH_API Quantizer : public c10::intrusive_ptr_target { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const ScalarType scalar_type_; explicit Quantizer(ScalarType scalar_type) : scalar_type_(scalar_type) {} ~Quantizer() override; diff --git a/aten/src/ATen/core/Reduction.h b/aten/src/ATen/core/Reduction.h index 23c6ea3cabefb..340e9f91ae8f7 100644 --- a/aten/src/ATen/core/Reduction.h +++ b/aten/src/ATen/core/Reduction.h @@ -1,16 +1,14 @@ #pragma once -namespace at { -namespace Reduction { +namespace at::Reduction { // NB: Keep this in sync with Reduction class in torch/nn/_reduction.py // These constants control the reduction behavior of loss functions. 
// Ideally, this would be a scoped enum, but jit doesn't support that enum Reduction { - None, // Do not reduce - Mean, // (Possibly weighted) mean of losses - Sum, // Sum losses + None, // Do not reduce + Mean, // (Possibly weighted) mean of losses + Sum, // Sum losses END }; -} // namespace Reduction -} // namespace at +} // namespace at::Reduction diff --git a/aten/src/ATen/core/SingletonSymNodeImpl.cpp b/aten/src/ATen/core/SingletonSymNodeImpl.cpp deleted file mode 100644 index 3ac668d987825..0000000000000 --- a/aten/src/ATen/core/SingletonSymNodeImpl.cpp +++ /dev/null @@ -1,78 +0,0 @@ -#include -#include -#include - -namespace c10 { - -namespace { -bool _eq(const char* op, c10::SymNodeImpl* lhs, c10::SymNodeImpl* rhs) { - TORCH_INTERNAL_ASSERT(lhs->singleton_int().has_value()); - c10::optional c = rhs->singleton_int(); - return ( - c.has_value() && lhs->singleton_int() == *c && - lhs->singleton_coeff() == rhs->singleton_coeff()); -} -bool _ge(const char* op, c10::SymNodeImpl* lhs, c10::SymNodeImpl* rhs) { - if (auto mb_si = lhs->singleton_int()) { - if (auto mb_si2 = rhs->singleton_int()) { - if (*mb_si == *mb_si2) { - return lhs->singleton_coeff() >= rhs->singleton_coeff(); - } - TORCH_CHECK(false, "Singleton int ", op, ": Relation is indeterminate"); - } - // NOLINTNEXTLINE(bugprone-unchecked-optional-access) - if (rhs->constant_int() && *rhs->constant_int() <= 2) { - return true; - } - TORCH_CHECK(false, "Singleton int ", op, ": Relation is indeterminate"); - } else if (rhs->singleton_int()) { - // NOLINTNEXTLINE(bugprone-unchecked-optional-access) - if (lhs->constant_int() && *lhs->constant_int() < 2) { - return false; - } - TORCH_CHECK(false, "Singleton int ", op, ": Relation is indeterminate"); - } - TORCH_INTERNAL_ASSERT(false, "expect at least one singleton"); -} -} // namespace - -c10::SymNode SingletonSymNodeImpl::eq(const c10::SymNode& other) { - return SymNode(c10::make_intrusive>( - _eq("eq", this, other.get()))); -} - -c10::SymNode SingletonSymNodeImpl::ne(const c10::SymNode& other) { - return SymNode(c10::make_intrusive>( - !_eq("ne", this, other.get()))); -} - -c10::SymNode SingletonSymNodeImpl::ge(const c10::SymNode& other) { - return SymNode(c10::make_intrusive>( - _ge("ge", this, other.get()))); -} - -c10::SymNode SingletonSymNodeImpl::gt(const c10::SymNode& other) { - return SymNode(c10::make_intrusive>( - !_ge("gt", other.get(), this))); -} - -c10::SymNode SingletonSymNodeImpl::lt(const c10::SymNode& other) { - return SymNode(c10::make_intrusive>( - !_ge("lt", this, other.get()))); -} - -c10::SymNode SingletonSymNodeImpl::le(const c10::SymNode& other) { - return SymNode(c10::make_intrusive>( - _ge("le", other.get(), this))); -} - -c10::SymNode SingletonSymNodeImpl::mul(const c10::SymNode& other) { - if (auto mb_si = other->singleton_int()) { - TORCH_CHECK(false, "Singleton int cannot be multiplied by singleton int"); - } - c10::optional c = other->constant_int(); - TORCH_CHECK(c.has_value()); - return SymNode(c10::make_intrusive(val_, coeff_ * *c)); -} - -} // namespace c10 diff --git a/aten/src/ATen/core/SingletonSymNodeImpl.h b/aten/src/ATen/core/SingletonSymNodeImpl.h deleted file mode 100644 index 5c4c9720f8487..0000000000000 --- a/aten/src/ATen/core/SingletonSymNodeImpl.h +++ /dev/null @@ -1,182 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace c10 { - -// The motivating usecase for this is to represent the ragged size structure -// of a jagged tensor [B, [s_0, s_1, s_2], D] as a single 
integer j0. This -// allows us to simply return [B, j0, D] if someone queries for the size of our -// tensor. -// -// Morally we define comparison between two singleton ints to return true if -// that comparison holds for all corresponding elements of the arrays they -// represent. Comparison between a singleton int and a plain int is defined -// similarly. -// -// To simulate this desired behavior but also avoid the O(N) cost of checking, -// we associate each raggedness pattern with an integer "id" that can be used as -// a proxy to evaluate equality. We also constrain the range of values for this -// as to enable inequality checks. -// -// We also support a positive integer scalar "coeff" that is used for computing -// strides. For example given, a [B, j0, D] tensor, it can be strided in two -// different ways: [D * j0, D, 1] and [j0, 1, sum(j0)]. The coeff is used to -// differentiate the two cases. -// -// During tracing the strides of the outputs need to be a function of the size -// and strides of the inputs so it is important that SingletonSymNode itself is -// able to express this. -class TORCH_API SingletonSymNodeImpl : public SymNodeImpl { - public: - // CAUTION: you should probably not be constructing these directly; please - // the higher-level API in python instead (TODO: actually introduce that). - explicit SingletonSymNodeImpl(int64_t val, int64_t coeff) - : val_(val), coeff_(coeff) {} - - bool bool_() override { - return false; - } - - bool is_int() override { - return true; - } - - bool is_float() override { - return false; - } - - bool is_bool() override { - return false; - } - - bool has_hint() override { - return true; - } - - c10::SymNode wrap_int(int64_t num) override { - return SymNode(c10::make_intrusive>(num)); - }; - - int64_t guard_int(const char* file, int64_t line) override { - TORCH_CHECK(false); - } - - double guard_float(const char* file, int64_t line) override { - TORCH_CHECK(false, "not a float"); - } - - bool guard_bool(const char* file, int64_t line) override { - TORCH_CHECK(false, "not a bool"); - } - - int64_t int_() override { - TORCH_CHECK(false); - } - - std::string str() override { - if (coeff_ == 1) { - return "j" + std::to_string(val_); - } - return std::to_string(coeff_) + "*j" + std::to_string(val_); - } - - // NOTE [ Inequalities with SingletonInt ] - // - // The semantics of SingletonInt when it comes to relations is that it is - // treated as integer known to be within a certain range, - // - // j0 \in [2, int64_t::max] - // - // allowing us to answer queries like j0 >= 1 (True), and j0 == 0 (False). - // This is a useful default range for the raggedness pattern of a jagged - // tensor (1) since sizes are non-negative, and (2) we need to get past 0/1 - // specialization checks. - // - // [ Indeterminate inequalities error out ] - // - // Given the semantic defined above, certain relations like j0 < 3 are thus - // indeterminable. In our impl today, evaluating such relations error - // - // It may seem convenient to just define indeterminate relations to return - // False, but the implementation we maintain in parallel using sympy does not - // allow this. - // - // Sympy only allows overriding of Ge. The other relations (Lt, Gt, Le) are, - // by consequence, all derived from Ge e.g., Lt(a, b) := !Ge(a, b). This - // would mean that means that if we define the indeterminate j0 >= 3 to be - // False, the also indeterminate j0 < 3 will be evaluated to be True! 
- // - // [ Coefficient are assumed positive ] - // - // For the purpose of computing inequalities, we consider the coefficient of - // the SingletonInt to be a positive integer. - // - // Thus, no modifications are needed to the logic since - // j0 >= k implies coeff * j0 >= k - // - c10::SymNode eq(const c10::SymNode& other) override; - c10::SymNode ne(const c10::SymNode& other) override; - c10::SymNode ge(const c10::SymNode& other) override; - c10::SymNode gt(const c10::SymNode& other) override; - c10::SymNode lt(const c10::SymNode& other) override; - c10::SymNode le(const c10::SymNode& other) override; - c10::SymNode mul(const c10::SymNode& other) override; - - c10::optional singleton_int() override { - return val_; - } - - c10::optional singleton_coeff() override { - return coeff_; - } - - bool is_symbolic() override { - return false; - } - -#define DEFINE_BINARY_NOT_SUPPORTED(name) \ - c10::SymNode name(const c10::SymNode& other) override { \ - TORCH_CHECK(false, #name " not supported by SingletonSymNode"); \ - } - - DEFINE_BINARY_NOT_SUPPORTED(add) - DEFINE_BINARY_NOT_SUPPORTED(sub) - DEFINE_BINARY_NOT_SUPPORTED(truediv) - DEFINE_BINARY_NOT_SUPPORTED(pow) - DEFINE_BINARY_NOT_SUPPORTED(floordiv) - DEFINE_BINARY_NOT_SUPPORTED(mod) - DEFINE_BINARY_NOT_SUPPORTED(sym_min) - DEFINE_BINARY_NOT_SUPPORTED(sym_max) - DEFINE_BINARY_NOT_SUPPORTED(sym_and) - DEFINE_BINARY_NOT_SUPPORTED(sym_or) - -#undef DEFINE_BINARY_NOT_SUPPORTED - -#define DEFINE_NOT_SUPPORTED(name) \ - c10::SymNode name() override { \ - TORCH_CHECK(false, #name " is not supported by SingletonSymNode"); \ - } - - DEFINE_NOT_SUPPORTED(sym_not) - DEFINE_NOT_SUPPORTED(ceil) - DEFINE_NOT_SUPPORTED(floor) - DEFINE_NOT_SUPPORTED(neg) - DEFINE_NOT_SUPPORTED(clone) - DEFINE_NOT_SUPPORTED(sym_float) - -#undef DEFINE_NOT_SUPPORTED - - private: - int64_t val_; - int64_t coeff_; -}; - -} // namespace c10 diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index 92befbf481f24..ed19144d0eaff 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -72,9 +72,9 @@ void TensorBase::enforce_invariants() { void TensorBase::print() const { if (defined()) { - std::cerr << "[" << toString() << " " << sizes() << "]" << std::endl; + std::cerr << "[" << toString() << " " << sizes() << "]" << '\n'; } else { - std::cerr << "[UndefinedTensor]" << std::endl; + std::cerr << "[UndefinedTensor]" << '\n'; } } diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index d20ab49d42783..8172cf31e7522 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -68,6 +68,7 @@ class TORCH_API TensorRef { }; template +// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) auto Tensor::register_hook(T&& hook) const -> Tensor::hook_return_void_t { // Return the grad argument in case of a hook with void return type to have an // std::function with Tensor return type @@ -81,6 +82,7 @@ auto Tensor::register_hook(T&& hook) const -> Tensor::hook_return_void_t { } template +// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) auto Tensor::register_hook(T&& hook) const -> Tensor::hook_return_var_t { return _register_hook([fn=std::forward(hook)](const TensorBase& grad_base) { TensorRef grad(grad_base); diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index 7440376d0e976..a1a4e0972d3ac 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace at { @@ -131,7 
+132,7 @@ class GenericPackedTensorAccessorBase { } // if index_t is not int64_t, we want to have an int64_t constructor - template ::value>::type> + template >> // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) C10_HOST GenericPackedTensorAccessorBase( PtrType data_, @@ -184,7 +185,7 @@ class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase(data_, sizes_, strides_) {} // if index_t is not int64_t, we want to have an int64_t constructor - template ::value>::type> + template >> C10_HOST GenericPackedTensorAccessor( PtrType data_, const source_index_t* sizes_, @@ -231,7 +232,7 @@ class GenericPackedTensorAccessor : public GenericPackedT : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {} // if index_t is not int64_t, we want to have an int64_t constructor - template ::value>::type> + template >> C10_HOST GenericPackedTensorAccessor( PtrType data_, const source_index_t* sizes_, diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index b82e6b25e1d80..e03c6bdf2bd10 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -27,11 +28,11 @@ namespace c10 { class Scalar; } -namespace torch { namespace autograd { +namespace torch::autograd { struct Node; -}} // namespace torch::autograd +} // namespace torch::autograd namespace at { @@ -415,7 +416,7 @@ class TORCH_API TensorBase { } /// Returns a `Tensor`'s device index. - int64_t get_device() const { + DeviceIndex get_device() const { // NB: this is not a native function to avoid dispatching overhead. return impl_->get_device(); } @@ -506,10 +507,10 @@ class TORCH_API TensorBase { return impl_->is_mps(); } - /// Returns if a `Tensor` is ort tensor. - bool is_ort() const { + /// Returns if a `Tensor` is maia tensor. + bool is_maia() const { // NB: this is not a native function to avoid dispatching overhead. - return impl_->is_ort(); + return impl_->is_maia(); } /// Returns if a `Tensor` is vulkan tensor. 
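Illustrative aside, not part of the patch: several hunks above (for example the GenericPackedTensorAccessor constructors) replace the verbose typename std::enable_if<std::is_same<...>::value>::type spelling with the shorter std::enable_if_t / std::is_same_v aliases without changing behaviour. A minimal self-contained sketch of the constrained-constructor idiom being modernized, with a made-up Accessor type:

#include <cstdint>
#include <type_traits>

template <typename index_t>
struct Accessor {
  // Primary constructor in the accessor's native index type.
  Accessor(const index_t* sizes, const index_t* strides) { use(sizes, strides); }

  // Extra constructor that only participates in overload resolution when the
  // source indices are int64_t, so an Accessor<int32_t> can still be built
  // from int64_t sizes/strides (narrowed internally).
  template <
      typename source_index_t,
      typename = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
  Accessor(const source_index_t* sizes, const source_index_t* strides) {
    use(sizes, strides);
  }

 private:
  template <typename T>
  void use(const T*, const T*) {}  // placeholder for copying sizes/strides
};

For example, constructing Accessor<int32_t> from const int64_t* arrays compiles via the second constructor, while passing const int32_t* arrays uses the first.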
@@ -593,9 +594,12 @@ class TORCH_API TensorBase { return mutable_data_ptr(); } - template + template , int> = 0> const T* const_data_ptr() const; + template , int> = 0> + const std::remove_const_t* const_data_ptr() const; + template T* mutable_data_ptr() const; @@ -620,7 +624,13 @@ class TORCH_API TensorBase { TensorAccessor accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data_ptr()"); TORCH_CHECK(dim() == N, "TensorAccessor expected ", N, " dims but tensor has ", dim()); - return TensorAccessor(data_ptr(),sizes().data(),strides().data()); + T* ptr = nullptr; + if constexpr (std::is_const::value) { + ptr = const_data_ptr(); + } else { + ptr = mutable_data_ptr(); + } + return TensorAccessor(ptr,sizes().data(),strides().data()); } template TensorAccessor accessor() && = delete; @@ -634,7 +644,13 @@ class TORCH_API TensorBase { GenericPackedTensorAccessor generic_packed_accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data_ptr()"); TORCH_CHECK(dim() == N, "TensorAccessor expected ", N, " dims but tensor has ", dim()); - return GenericPackedTensorAccessor(static_cast::PtrType>(data_ptr()),sizes().data(),strides().data()); + T* ptr = nullptr; + if constexpr (std::is_const::value) { + ptr = const_data_ptr(); + } else { + ptr = mutable_data_ptr(); + } + return GenericPackedTensorAccessor(static_cast::PtrType>(ptr),sizes().data(),strides().data()); } template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> GenericPackedTensorAccessor generic_packed_accessor() && = delete; @@ -815,9 +831,9 @@ class TORCH_API TensorBase { //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template - using hook_return_void_t = std::enable_if_t>::value, unsigned>; + using hook_return_void_t = std::enable_if_t>, unsigned>; template - using hook_return_var_t = std::enable_if_t, TensorBase>::value, unsigned>; + using hook_return_var_t = std::enable_if_t, TensorBase>, unsigned>; /// Registers a backward hook. 
/// @@ -904,15 +920,16 @@ class TORCH_API TensorBase { TensorBase __dispatch_contiguous(c10::MemoryFormat) const; }; -inline int64_t get_device(const TensorBase& self) { +inline DeviceIndex get_device(const TensorBase& self) { return self.get_device(); } template +// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) auto TensorBase::register_hook(T&& hook) const -> TensorBase::hook_return_void_t { // Return the grad argument in case of a hook with void return type to have an // std::function with Tensor return type - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Expected hook to return void"); return _register_hook([fn=std::forward(hook)](const TensorBase& grad) { fn(grad); @@ -1010,9 +1027,9 @@ inline c10::MaybeOwned TensorBase::expect_contiguous(MemoryFormat me namespace symint { template -using enable_if_symint = std::enable_if_t::value>; +using enable_if_symint = std::enable_if_t>; template -using enable_if_int = std::enable_if_t::value>; +using enable_if_int = std::enable_if_t>; template > c10::SymIntArrayRef sizes(const TensorBase& t) { return t.sym_sizes(); } diff --git a/aten/src/ATen/core/TorchDispatchUtils.cpp b/aten/src/ATen/core/TorchDispatchUtils.cpp index e2f981c6a8332..8f666e5a476ab 100644 --- a/aten/src/ATen/core/TorchDispatchUtils.cpp +++ b/aten/src/ATen/core/TorchDispatchUtils.cpp @@ -1,7 +1,7 @@ #include -namespace at { -namespace impl { + +namespace at::impl { bool tensor_has_dispatch(const at::Tensor& t) { DispatchKeySet key_set({DispatchKey::Python, DispatchKey::PythonTLSSnapshot}); @@ -27,5 +27,4 @@ bool tensorlist_has_dispatch(const c10::List>& li) { return false; } -} // namespace impl -} // namespace at +} // namespace at::impl diff --git a/aten/src/ATen/core/TorchDispatchUtils.h b/aten/src/ATen/core/TorchDispatchUtils.h index a55074812b612..0ead779360097 100644 --- a/aten/src/ATen/core/TorchDispatchUtils.h +++ b/aten/src/ATen/core/TorchDispatchUtils.h @@ -6,12 +6,11 @@ #include #include -namespace at { -namespace impl { +namespace at::impl { TORCH_API bool tensor_has_dispatch(const at::Tensor& t); TORCH_API bool tensorlist_has_dispatch(at::ITensorListRef li); TORCH_API bool tensorlist_has_dispatch(const c10::List>& li); using c10::impl::dispatch_mode_enabled; -}} +} diff --git a/aten/src/ATen/core/TransformationHelper.h b/aten/src/ATen/core/TransformationHelper.h index 1061a732ddb7e..f81018a8e674f 100644 --- a/aten/src/ATen/core/TransformationHelper.h +++ b/aten/src/ATen/core/TransformationHelper.h @@ -1,11 +1,13 @@ +#include #include #include #include #include -#include -#include +#include #include #include +#include +#include namespace at { @@ -54,12 +56,12 @@ C10_HOST_DEVICE inline T uniform_int_full_range(V val) { * in this overloaded version */ template -C10_HOST_DEVICE inline typename std::enable_if::value), T>::type uniform_int(V val) { +C10_HOST_DEVICE inline std::enable_if_t), T>uniform_int(V val) { if constexpr (std::is_same_v) { return static_cast(val & 1); } else if constexpr (std::is_same_v) { return static_cast(val % (static_cast(std::numeric_limits::max()) + 1)); - } else if constexpr (std::is_same_v || std::is_same::value) { + } else if constexpr (std::is_same_v || std::is_same_v) { return static_cast(val % static_cast((1ULL << std::numeric_limits::digits) + 1)); } else if constexpr (std::is_integral_v) { return static_cast(val % (static_cast(std::numeric_limits::max()) + 1)); @@ -74,7 +76,7 @@ C10_HOST_DEVICE inline typename std::enable_if::valu * added to fix compiler warnings reported in GitHub issue 46391. 
T is either float or double in this version. */ template -C10_HOST_DEVICE inline typename std::enable_if::value, T>::type uniform_int(V val) { +C10_HOST_DEVICE inline std::enable_if_t, T>uniform_int(V val) { return static_cast(val % static_cast((1ULL << std::numeric_limits::digits) + 1)); } diff --git a/aten/src/ATen/core/VariableHooksInterface.cpp b/aten/src/ATen/core/VariableHooksInterface.cpp index 7525584e0d7d0..a062582c2101f 100644 --- a/aten/src/ATen/core/VariableHooksInterface.cpp +++ b/aten/src/ATen/core/VariableHooksInterface.cpp @@ -1,6 +1,6 @@ #include -namespace at { namespace impl { +namespace at::impl { namespace { VariableHooksInterface* hooks = nullptr; @@ -17,4 +17,4 @@ bool HasVariableHooks() { return hooks != nullptr; } -}} // namespace at::impl +} // namespace at::impl diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h index f75342dea76a9..47d74f5433ac2 100644 --- a/aten/src/ATen/core/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -1,7 +1,7 @@ #pragma once -#include #include +#include // A little explanation about why this file exists at all. We have // a few methods on Tensor class which require access to reified access to @@ -29,20 +29,20 @@ // have weird signatures that are not supported by autograd, and (2) // see this bug https://github.com/pytorch/pytorch/issues/30102 -namespace torch { namespace autograd { +namespace torch::autograd { struct Node; -}} // namespace torch::autograd +} // namespace torch::autograd -namespace at { -namespace impl { +namespace at::impl { struct TORCH_API VariableHooksInterface { virtual ~VariableHooksInterface() = default; virtual TensorBase tensor_data(const TensorBase&) const = 0; virtual TensorBase variable_data(const TensorBase&) const = 0; - virtual const std::shared_ptr& grad_fn(const TensorBase&) const = 0; + virtual const std::shared_ptr& grad_fn( + const TensorBase&) const = 0; virtual unsigned _register_hook( const TensorBase&, std::function hook) const = 0; @@ -57,9 +57,17 @@ struct TORCH_API VariableHooksInterface { virtual int64_t _version(const TensorBase&) const = 0; virtual void retain_grad(const TensorBase&) const = 0; virtual bool retains_grad(const TensorBase&) const = 0; - virtual void _backward(const Tensor&, TensorList, const c10::optional&, c10::optional, bool) const = 0; + virtual void _backward( + const Tensor&, + TensorList, + const c10::optional&, + c10::optional, + bool) const = 0; virtual void requires_grad_(const TensorBase&, bool) const = 0; - virtual void basic_autograd_not_implemented_fallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) const = 0; + virtual void basic_autograd_not_implemented_fallback( + const c10::OperatorHandle& op, + c10::DispatchKeySet dispatch_keys, + torch::jit::Stack* stack) const = 0; }; TORCH_API void SetVariableHooks(VariableHooksInterface* hooks); @@ -72,4 +80,4 @@ struct TORCH_API VariableHooksRegisterer { } }; -}} // namespace at::impl +} // namespace at::impl diff --git a/aten/src/ATen/core/Variadic.h b/aten/src/ATen/core/Variadic.h index 61b6a35a0b1cb..da4df1b1b1a66 100644 --- a/aten/src/ATen/core/Variadic.h +++ b/aten/src/ATen/core/Variadic.h @@ -1,8 +1,5 @@ #pragma once -#include -#include -#include #include #include diff --git a/aten/src/ATen/core/Vitals.cpp b/aten/src/ATen/core/Vitals.cpp index 6746540f43e12..a854be6756bfa 100644 --- a/aten/src/ATen/core/Vitals.cpp +++ b/aten/src/ATen/core/Vitals.cpp @@ -2,8 +2,7 @@ #include #include 
-namespace at { -namespace vitals { +namespace at::vitals { APIVitals VitalsAPI; @@ -78,8 +77,7 @@ bool APIVitals::setVital( auto iter = name_map_.find(vital_name); TorchVital* vital = nullptr; if (iter == name_map_.end()) { - auto r = - name_map_.emplace(vital_name, TorchVital(vital_name)); + auto r = name_map_.emplace(vital_name, TorchVital(vital_name)); vital = &r.first->second; } else { vital = &iter->second; @@ -95,5 +93,4 @@ APIVitals::APIVitals() : vitals_enabled(false), name_map_() { setVital("CUDA", "used", "False", /* force = */ true); } -} // namespace vitals -} // namespace at +} // namespace at::vitals diff --git a/aten/src/ATen/core/Vitals.h b/aten/src/ATen/core/Vitals.h index 2d4fe1cc0995c..8a7a51e81e1d2 100644 --- a/aten/src/ATen/core/Vitals.h +++ b/aten/src/ATen/core/Vitals.h @@ -1,15 +1,11 @@ #pragma once -#include -#include -#include #include #include #include #include -namespace at { -namespace vitals { +namespace at::vitals { TORCH_API bool torchVitalEnabled(); @@ -82,8 +78,7 @@ class TORCH_API APIVitals { extern TORCH_API APIVitals VitalsAPI; -} // namespace vitals -} // namespace at +} // namespace at::vitals #define TORCH_VITAL_DECLARE(name) \ TORCH_API at::vitals::TorchVital TorchVital_##name; diff --git a/aten/src/ATen/core/adaption.cpp b/aten/src/ATen/core/adaption.cpp index 0c2976ab09219..ef06b9606ba7e 100644 --- a/aten/src/ATen/core/adaption.cpp +++ b/aten/src/ATen/core/adaption.cpp @@ -1,15 +1,13 @@ #include -namespace c10 { -namespace impl { + +namespace c10::impl { void common_device_check_failure(Device common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) { TORCH_CHECK(false, "Expected all tensors to be on the same device, but " - // NOLINTNEXTLINE(bugprone-unchecked-optional-access) "found at least two devices, ", common_device, " and ", tensor.device(), "! " "(when checking argument for argument ", argName, " in method ", methodName, ")"); } -} // namespace impl -} // namespace c10 +} // namespace c10::impl diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h index 3e8da3e4e7a67..35ee3b358c991 100644 --- a/aten/src/ATen/core/blob.h +++ b/aten/src/ATen/core/blob.h @@ -1,10 +1,6 @@ #pragma once -#include -#include #include -#include -#include #include #include @@ -26,7 +22,7 @@ class TORCH_API Blob final : public c10::intrusive_ptr_target { /** * Initializes an empty Blob. */ - Blob() noexcept : meta_(), pointer_(nullptr), has_ownership_(false) {} + Blob() noexcept : meta_() {} ~Blob() override { Reset(); } @@ -148,11 +144,11 @@ class TORCH_API Blob final : public c10::intrusive_ptr_target { * call is made or the blob is destructed. */ template - typename std::remove_const::type* ShareExternal( - typename std::remove_const::type* allocated) { + std::remove_const_t* ShareExternal( + std::remove_const_t* allocated) { return static_cast(ShareExternal( static_cast(allocated), - TypeMeta::Make::type>())); + TypeMeta::Make>())); } void* ShareExternal(void* allocated, const TypeMeta meta) { @@ -176,7 +172,7 @@ class TORCH_API Blob final : public c10::intrusive_ptr_target { /** * @brief Swaps the underlying storage of two blobs. 
*/ - void swap(Blob& rhs) { + void swap(Blob& rhs) noexcept { using std::swap; swap(meta_, rhs.meta_); swap(pointer_, rhs.pointer_); @@ -191,13 +187,13 @@ class TORCH_API Blob final : public c10::intrusive_ptr_target { } TypeMeta meta_; - void* pointer_; - bool has_ownership_; + void* pointer_{nullptr}; + bool has_ownership_{false}; C10_DISABLE_COPY_AND_ASSIGN(Blob); }; -inline void swap(Blob& lhs, Blob& rhs) { +inline void swap(Blob& lhs, Blob& rhs) noexcept { lhs.swap(rhs); } diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index d8d0a3d151407..c950f4c80ffc7 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace c10 { @@ -17,7 +18,7 @@ class KernelFunction; template using has_symint = - guts::disjunction< + std::disjunction< std::is_same, std::is_same, std::is_same, diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index 8ef5315fbc7cc..0d6149c8090a9 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -3,6 +3,9 @@ #include #include +#include +#include + namespace c10 { inline KernelFunction::KernelFunction() @@ -83,8 +86,7 @@ C10_ALWAYS_INLINE Return KernelFunction::call(const OperatorHandle& opHandle, Di // forwarding, which would require Args to be deduced, but instead we // want callers to explicitly specify the Args. - // This should get inlined by compiler - if (guts::disjunction...>::value) { + if constexpr (std::disjunction_v...>) { if (sym_unboxed_kernel_func_ != nullptr) { auto *functor = boxed_kernel_func_.getFunctor(); return callUnboxedKernelFunction( diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h index efc68570924ef..82fdd824ea65b 100644 --- a/aten/src/ATen/core/boxing/impl/boxing.h +++ b/aten/src/ATen/core/boxing/impl/boxing.h @@ -10,6 +10,7 @@ #include #include +#include namespace c10 { namespace impl { @@ -38,7 +39,15 @@ template struct has_ivalue_to : std::false_type {}; template -struct has_ivalue_to().to())>> +struct ivalue_to_helper +{ + using type = decltype(std::declval().template to()); +}; +template +using ivalue_to_helper_t = typename ivalue_to_helper::type; + +template +struct has_ivalue_to>> : std::true_type {}; @@ -49,7 +58,7 @@ struct has_ivalue_to().to())>> // A boxable arg type is one that IValue has a constructor for. 
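Editor's note: the `ivalue_to_helper`/`has_ivalue_to` rework in boxing.h above is the standard member-detection idiom: probe `declval<IValue>().to<T>()` inside a `void_t` partial specialization and fall back to `std::false_type` when the expression is ill-formed. A self-contained sketch of the same pattern, using a toy `Value` class instead of the real `c10::IValue`:

```cpp
#include <string>
#include <type_traits>
#include <utility>

// Toy stand-in for c10::IValue: only arithmetic types are extractable here.
struct Value {
  template <class T, std::enable_if_t<std::is_arithmetic_v<T>, int> = 0>
  T to() const {
    return T{};
  }
};

// Primary template: assume no conversion exists.
template <class T, class Enable = void>
struct has_value_to : std::false_type {};

// Chosen only when `declval<Value>().to<T>()` is a well-formed expression.
template <class T>
struct has_value_to<
    T,
    std::void_t<decltype(std::declval<Value>().template to<T>())>>
    : std::true_type {};

static_assert(has_value_to<int>::value, "arithmetic types are extractable");
static_assert(!has_value_to<std::string>::value, "std::string is not");
```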
template using can_box = - guts::disjunction< + std::disjunction< std::is_constructible>, // TensorOptions are not directly constructible into IValue, // but torch::jit::push knows how to handle them @@ -57,18 +66,18 @@ using can_box = >; template -using can_box_all = guts::conjunction...>; +using can_box_all = std::conjunction...>; // an unboxable result is one that can be extracted from an IValue template using can_unbox = - guts::conjunction< - guts::disjunction< + std::conjunction< + std::disjunction< has_ivalue_to, // void returns are ok std::is_same >, - guts::negation> + std::negation> >; // diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index 5308499edd439..ccd94ff1de2be 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -183,7 +183,7 @@ namespace impl { struct assert_is_valid_input_type::value>> { // There is no reason to support float when we have double. Keep the API lean. static_assert(guts::false_t::value, - "You tried to register a kernel with an unsupported input type: float. Please use double instead."); + "You tried to register a kernel with an unsupported input type: float. Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string."); }; template struct assert_is_valid_input_type::value>> { @@ -198,7 +198,7 @@ namespace impl { template struct assert_is_valid_input_type::value && !guts::typelist::contains::value>> { static_assert(guts::false_t::value, - "You tried to register a kernel with an unsupported integral input type. Please use int64_t instead."); + "You tried to register a kernel with an unsupported integral input type. Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string."); }; template struct assert_is_valid_input_type::value>> { @@ -283,7 +283,7 @@ namespace impl { struct assert_is_valid_output_type::value>> { // There is no reason to support float when we have double. Keep the API lean. static_assert(guts::false_t::value, - "You tried to register a kernel with an unsupported output type: float. Please use double instead."); + "You tried to register a kernel with an unsupported output type: float. Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string."); }; template struct assert_is_valid_output_type::value>> { @@ -298,7 +298,7 @@ namespace impl { template struct assert_is_valid_output_type::value && !guts::typelist::contains::value>> { static_assert(guts::false_t::value, - "You tried to register a kernel with an unsupported integral output type. Please use int64_t instead."); + "You tried to register a kernel with an unsupported integral output type. 
Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string."); }; // ivalue_to_arg diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index 887e57b157ed5..b25ca55c16851 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -7,8 +7,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { struct BuiltinOpFunction : public Function { BuiltinOpFunction( @@ -62,12 +61,16 @@ struct BuiltinOpFunction : public Function { return *this; } - bool call(Stack& stack, c10::optional, c10::function_ref) override { + bool call( + Stack& stack, + c10::optional, + c10::function_ref) override { run(stack); return false; } - bool call(Stack& stack, c10::function_ref) override { + bool call(Stack& stack, c10::function_ref) + override { run(stack); return false; } @@ -84,5 +87,4 @@ struct BuiltinOpFunction : public Function { std::string doc_string_; }; -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/aten/src/ATen/core/class_type.h b/aten/src/ATen/core/class_type.h index 310ed332aec77..99fd27bba5426 100644 --- a/aten/src/ATen/core/class_type.h +++ b/aten/src/ATen/core/class_type.h @@ -6,12 +6,12 @@ #include #include -namespace torch { -namespace jit { + +namespace torch::jit { struct CompilationUnit; struct Function; -} // namespace jit -} // namespace torch +} // namespace torch::jit + namespace c10 { @@ -390,7 +390,7 @@ struct TORCH_API ClassType : public NamedType { std::string doc_string = "", std::vector unresolved_class_attributes = {}); - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { const auto& n = name().value(); return n.qualifiedName(); } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 66b199de3cd18..6077ac8e34cc8 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -266,24 +266,25 @@ void Dispatcher::deregisterDef_( namespace { -using AbstractImplPyStubsType = std::unordered_map>; -AbstractImplPyStubsType& abstractImplPyStubsSingleton() { - static AbstractImplPyStubsType _data; +// Maps OperatorName to (python module name, description) tuple. +using PythonModuleMapType = std::unordered_map>; +PythonModuleMapType& pythonModulesSingleton() { + static PythonModuleMapType _data; return _data; } } -c10::optional> Dispatcher::getAbstractImplPyStub(OperatorName op_name) { +c10::optional> Dispatcher::getPyStub(OperatorName op_name) { std::lock_guard lock(guard_->mutex); - auto found = abstractImplPyStubsSingleton().find(op_name); - if (found == abstractImplPyStubsSingleton().end()) { + auto found = pythonModulesSingleton().find(op_name); + if (found == pythonModulesSingleton().end()) { return c10::nullopt; } return found->second; } -RegistrationHandleRAII Dispatcher::registerAbstractImplPyStub( +RegistrationHandleRAII Dispatcher::registerPythonModule( const OperatorName& op_name, const char* pymodule, const char* context @@ -292,28 +293,28 @@ RegistrationHandleRAII Dispatcher::registerAbstractImplPyStub( // If there are duplicates, we just let it through and warn about it. // Throwing an error during static initialization causes a crash that // doesn't give any sign of what happened. 
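Editor's note: the expanded `static_assert` messages in make_boxed_from_unboxed_functor.h above spell out the registration convention: kernels use `double` and `int64_t` in their C++ signatures even though the schema strings spell those arguments `float` and `int`. A hypothetical registration following that convention (the operator name and namespace are invented for illustration):

```cpp
#include <ATen/ATen.h>
#include <torch/library.h>

// The C++ signature uses double and int64_t ...
at::Tensor scale_shift(const at::Tensor& x, double scale, int64_t shift) {
  return x * scale + shift;
}

TORCH_LIBRARY(myexample, m) {
  // ... while the schema string spells the same arguments as float and int.
  m.def("scale_shift(Tensor x, float scale, int shift) -> Tensor", &scale_shift);
}
```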
- auto found = abstractImplPyStubsSingleton().find(op_name); - if (found != abstractImplPyStubsSingleton().end()) { + auto found = pythonModulesSingleton().find(op_name); + if (found != pythonModulesSingleton().end()) { TORCH_WARN( - "Tried to register an abstract impl pystub for ", op_name, " ", + "Tried to register a Python registration stub (pystub) for ", op_name, " ", "that specifies the Python module ", pymodule, " " "but there already was a pystub that specifies the Python module ", found->second.first, ". We will override the existing pystub."); } - abstractImplPyStubsSingleton()[op_name] = std::make_pair(pymodule, context); + pythonModulesSingleton()[op_name] = std::make_pair(pymodule, context); return RegistrationHandleRAII([guard = this->guard_, op_name] { std::lock_guard lock(guard->mutex); if (!guard->alive.load()) { return; } - abstractImplPyStubsSingleton().erase(op_name); + pythonModulesSingleton().erase(op_name); }); } -void Dispatcher::throwIfHasAbstractImplPyStub(OperatorName op_name) { +void Dispatcher::throwIfHasPythonModule(OperatorName op_name) { std::lock_guard lock(guard_->mutex); - auto elt = abstractImplPyStubsSingleton().find(op_name); - if (elt == abstractImplPyStubsSingleton().end()) { + auto elt = pythonModulesSingleton().find(op_name); + if (elt == pythonModulesSingleton().end()) { return; } const char* pymodule = elt->second.first; @@ -498,37 +499,51 @@ std::vector Dispatcher::getRegistrationsForDispatchKey(c10::option }); } -int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchKey) { +int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchKey, DispatchKeySet dispatchKeySet) { int64_t seq_num = -1; // Setting sequence number in the Autograd case to associate // the forward range with the corresponding Autograd's node - if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { + + // Note: this records a sequence number for both Autograd keys, and for + // non-Autograd keys where the dispatchKeySet still contains an autograd key. + // This means that we might collect the same sequence number for two different + // events if they all occurred above Autograd and still had the Autograd + // dispatch key in the dispatch key set. + // However, this usually doesn't happen: normally the first call will + // go through the call() or callBoxed() path in the dispatcher, while + // subsequent redispatches go through redispatch() or redispatchBoxed(). + // `call` has profiler instrumentation, whereas `redispatch` doesn't. + // So usually, we'll collect a sequence number on the first call() if the + // dispatch keys contain autograd, and not on subsequent redispatches.
+ bool dispatchHasAutograd = !(dispatchKeySet & autograd_dispatch_keyset).empty(); + + if (dispatchHasAutograd && at::GradMode::is_enabled()) { seq_num = at::sequence_number::peek(); } return seq_num; } -void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, c10::ArrayRef args) { - guard.before(schema_ref, args, sequenceNumberForRunningRecordFunction(dispatchKey)); +void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, DispatchKeySet dispatchKeySet, c10::ArrayRef args) { + guard.before(schema_ref, args, sequenceNumberForRunningRecordFunction(dispatchKey, dispatchKeySet)); } -void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey) { +void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, DispatchKeySet dispatchKeySet) { // Setting sequence number in the Autograd case to associate // the forward range with the corresponding Autograd's node - guard.before(schema_ref, sequenceNumberForRunningRecordFunction(dispatchKey)); + guard.before(schema_ref, sequenceNumberForRunningRecordFunction(dispatchKey, dispatchKeySet)); } #ifdef FBCODE_CAFFE2 bool Dispatcher::profilingOperatorEvents() { return TORCH_SDT_IS_ENABLED(operator_start) || TORCH_SDT_IS_ENABLED(operator_end); } -void Dispatcher::fireOpStartUSDT(at::RecordFunction::schema_ref_t schema_ref) { +C10_NOINLINE void Dispatcher::fireOpStartUSDT(at::RecordFunction::schema_ref_t schema_ref) { if (TORCH_SDT_IS_ENABLED(operator_start)) { TORCH_SDT_WITH_SEMAPHORE(operator_start, schema_ref.get().name().c_str()); } } -void Dispatcher::fireOpEndUSDT(at::RecordFunction::schema_ref_t schema_ref) { +C10_NOINLINE void Dispatcher::fireOpEndUSDT(at::RecordFunction::schema_ref_t schema_ref) { if (TORCH_SDT_IS_ENABLED(operator_end)) { TORCH_SDT_WITH_SEMAPHORE(operator_end, schema_ref.get().name().c_str()); } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index d383ee95569a2..caf73d7cebb21 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -224,17 +224,17 @@ class TORCH_API Dispatcher final { RegistrationHandleRAII registerImpl(OperatorName op_name, c10::optional dispatch_key, KernelFunction kernel, c10::optional cpp_signature, std::unique_ptr inferred_function_schema, std::string debug); /** - * Given an operator, tells the Dispatcher that we have implemented an abstract impl + * Given an operator, tells the Dispatcher that we have implemented a fake impl * for this op in the given Python module. Call this a "pystub". */ - RegistrationHandleRAII registerAbstractImplPyStub(const OperatorName& op_name, const char* pymodule, const char* context); + RegistrationHandleRAII registerPythonModule(const OperatorName& op_name, const char* pymodule, const char* context); /** - * Given an operator, throws if we have an abstract impl pystub. + * Given an operator, throws if we have a pystub. */ - void throwIfHasAbstractImplPyStub(OperatorName op_name); + void throwIfHasPythonModule(OperatorName op_name); - c10::optional> getAbstractImplPyStub(OperatorName op_name); + c10::optional> getPyStub(OperatorName op_name); /** * Register a new operator by name. 
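Editor's note: the new `dispatchHasAutograd` test above inspects the whole dispatch key set rather than only the highest-priority key, recording a sequence number whenever the set still intersects the autograd keys. A rough standalone model of that decision, with a plain bitmask standing in for `c10::DispatchKeySet` (the key names below are illustrative, not the real enum):

```cpp
#include <cstdint>
#include <iostream>

// Toy dispatch keys: one bit per key in a 64-bit set.
enum class Key : uint64_t {
  CPU          = 1ull << 0,
  CUDA         = 1ull << 1,
  AutogradCPU  = 1ull << 2,
  AutogradCUDA = 1ull << 3,
};

constexpr uint64_t bits(Key k) { return static_cast<uint64_t>(k); }

// Simplified analogue of autograd_dispatch_keyset.
constexpr uint64_t kAutogradKeys = bits(Key::AutogradCPU) | bits(Key::AutogradCUDA);

// Mirror of `!(dispatchKeySet & autograd_dispatch_keyset).empty()`:
// only hand out a sequence number when an autograd key is still in the set.
int64_t sequenceNumberFor(uint64_t keySet, bool grad_enabled, int64_t next_seq) {
  const bool has_autograd = (keySet & kAutogradKeys) != 0;
  return (has_autograd && grad_enabled) ? next_seq : -1;
}

int main() {
  const uint64_t first_call = bits(Key::CPU) | bits(Key::AutogradCPU);
  const uint64_t no_autograd = bits(Key::CPU);  // no autograd key left in the set
  std::cout << sequenceNumberFor(first_call, /*grad_enabled=*/true, 42) << "\n";   // 42
  std::cout << sequenceNumberFor(no_autograd, /*grad_enabled=*/true, 42) << "\n";  // -1
  return 0;
}
```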
@@ -304,9 +304,9 @@ class TORCH_API Dispatcher final { private: Dispatcher(); - static int64_t sequenceNumberForRunningRecordFunction(DispatchKey dispatchKey); - static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey); - static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, c10::ArrayRef args); + static int64_t sequenceNumberForRunningRecordFunction(DispatchKey dispatchKey, DispatchKeySet dispatchKeySet); + static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, DispatchKeySet dispatchKeySet); + static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, DispatchKeySet dispatchKeySet, c10::ArrayRef args); #ifdef FBCODE_CAFFE2 static bool profilingOperatorEvents(); @@ -403,6 +403,10 @@ class TORCH_API OperatorHandle { return operatorDef_->op.hasKernelForDispatchKey(k); } + bool isKernelFallthroughKernel(DispatchKey k) const { + return operatorDef_->op.kernelForDispatchKey(k).isFallthrough(); + } + bool hasKernelForAnyDispatchKey(DispatchKeySet k) const { return operatorDef_->op.hasKernelForAnyDispatchKey(k); } @@ -630,15 +634,15 @@ inline Return Dispatcher::callWithDispatchKeySlowPath(const TypedOperatorHandle< TORCH_INTERNAL_ASSERT_DEBUG_ONLY(lastArgIdx == num_boxed_args); // I don't *think* we need std::launder here, because IValue has // no subclasses and no const or reference fields. - runRecordFunction(guard, schema_ref, dispatchKey, c10::ArrayRef(reinterpret_cast(boxedArgs), num_boxed_args)); + runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet, c10::ArrayRef(reinterpret_cast(boxedArgs), num_boxed_args)); for (size_t ii = 0; ii < num_boxed_args; ++ii) { reinterpret_cast(&boxedArgs[ii])->~IValue(); } } else { - runRecordFunction(guard, schema_ref, dispatchKey); + runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); } } else { - runRecordFunction(guard, schema_ref, dispatchKey); + runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); } if (C10_UNLIKELY(guard.needsOutputs())) { @@ -732,8 +736,8 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); auto& schema = op.schema(); auto schema_ref = std::reference_wrapper(schema); - guard.needsInputs() ? runRecordFunction(guard, schema_ref, dispatchKey, c10::ArrayRef(stack->data(), stack->size())) - : runRecordFunction(guard, schema_ref, dispatchKey); + guard.needsInputs() ? runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet, c10::ArrayRef(stack->data(), stack->size())) + : runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); // keeping the guard alive while executing the kernel kernel.callBoxed(op, dispatchKeySet, stack); diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 627109c516daf..5f4538f2c9790 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -421,7 +421,7 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp // In theory, we should only have to check if the given runtime key has "dense" functionality, // e.g. DispatchKey::CPU (which is composed of DispatchKey::Dense and BackendComponent::CPUBit). 
// However, there are some backends that should be included in this set that don't have the dense key set. - // E.g. DispatchKey::Meta, DispatchKey::ORT. + // E.g. DispatchKey::Meta, DispatchKey::MAIA. if (c10::isBackendDispatchKey(dispatch_key)) { DispatchKey autograd_key = getAutogradKeyFromBackend(toBackendComponent(dispatch_key)); updateDispatchTableEntry_(dispatcher, autograd_key); diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index ecfb6a999bc2e..25b75b9e51114 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -187,7 +187,7 @@ class DynamicType : public SharedType { bool equals(const DynamicType& other) const; template - bool compareArguments(const DynamicType& other, F&& f) const { + bool compareArguments(const DynamicType& other, const F& f) const { if (arguments_.elems.size() != other.arguments_.elems.size()) { return false; } diff --git a/aten/src/ATen/core/enum_type.h b/aten/src/ATen/core/enum_type.h index bd60c1e77224f..136fe59e22fb5 100644 --- a/aten/src/ATen/core/enum_type.h +++ b/aten/src/ATen/core/enum_type.h @@ -88,7 +88,7 @@ struct TORCH_API EnumType : public NamedType { cu_(std::move(cu)) {} std::string annotation_str_impl( - C10_UNUSED TypePrinter printer = nullptr) const override { + C10_UNUSED const TypePrinter& printer = nullptr) const override { const auto& n = name().value(); return n.qualifiedName(); } diff --git a/aten/src/ATen/core/function.h b/aten/src/ATen/core/function.h index b4f82712a57d0..f55e15e50b4fa 100644 --- a/aten/src/ATen/core/function.h +++ b/aten/src/ATen/core/function.h @@ -14,8 +14,7 @@ namespace at { TORCH_API void launch(std::function func); } -namespace torch { -namespace jit { +namespace torch::jit { struct Graph; struct Code; @@ -29,7 +28,9 @@ using Kwargs = std::unordered_map; struct RecursiveMethodCallError : public std::exception {}; using TaskLauncher = std::function)>; -TORCH_API void preoptimizeGraph(std::shared_ptr& graph, bool disable_autocast=false); +TORCH_API void preoptimizeGraph( + std::shared_ptr& graph, + bool disable_autocast = false); // A Function is a pure Graph with no implicit `self` object bound. // It contains schema information and the executor that manages the @@ -54,14 +55,13 @@ struct TORCH_API Function { virtual c10::intrusive_ptr runAsync( Stack& /*stack*/, + // NOLINTNEXTLINE(performance-unnecessary-value-param) C10_UNUSED TaskLauncher taskLauncher = at::launch) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return {}; } - at::IValue operator()( - Stack stack, - const Kwargs& kwargs = Kwargs()) { + at::IValue operator()(Stack stack, const Kwargs& kwargs = Kwargs()) { getSchema().checkAndNormalizeInputs(stack, kwargs); run(stack); return stack.front(); @@ -93,8 +93,12 @@ struct TORCH_API Function { // If call() returns true, then callback completes successfully, otherwise // call() returns false. - // Overload for server interpreter, a bailout size is needed for graph executor. - virtual bool call(Stack&, c10::optional, c10::function_ref) { + // Overload for server interpreter, a bailout size is needed for graph + // executor. 
+ virtual bool call( + Stack&, + c10::optional, + c10::function_ref) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; } @@ -107,5 +111,4 @@ struct TORCH_API Function { virtual ~Function() = default; }; -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index b3818784561f4..79e7ffed1a14f 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -143,10 +143,10 @@ struct Argument { inferred_type_hint); } - Argument cloneWithType(TypePtr new_type) const { + Argument cloneWithType(const TypePtr& new_type) const { return Argument( name_, - std::move(new_type), + new_type, N_, default_value_, kwarg_only_, diff --git a/aten/src/ATen/core/functional.h b/aten/src/ATen/core/functional.h index 6b4f3447f5d48..1ddc674182010 100644 --- a/aten/src/ATen/core/functional.h +++ b/aten/src/ATen/core/functional.h @@ -9,7 +9,7 @@ namespace c10 { // const reference (const T&); taking T by non-const reference // will result in an error like: // -// error: no type named 'type' in 'class std::result_of' +// error: no type named 'type' in 'class std::invoke_result' // // No explicit template parameters are required. diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 15857eec9df06..4f6abd66cb887 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -1,9 +1,4 @@ #pragma once -#include -#include -#include -#include -#include #include @@ -232,6 +227,7 @@ namespace c10 { _(aten, is_autocast_enabled) \ _(aten, is_autocast_cpu_enabled) \ _(aten, is_autocast_xla_enabled) \ + _(aten, get_autocast_dtype) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ _(onnx, Concat) \ diff --git a/aten/src/ATen/core/interned_strings_class.h b/aten/src/ATen/core/interned_strings_class.h index 6e57332b99f97..a215fa62c7e91 100644 --- a/aten/src/ATen/core/interned_strings_class.h +++ b/aten/src/ATen/core/interned_strings_class.h @@ -1,5 +1,3 @@ -#include -#include #include #include #include diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 4ce4b9c42774c..7343d66fcb97d 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -258,7 +258,6 @@ void IValue::getSubValues(HashAliasedIValues& subValues) const { case Tag::Capsule: TORCH_CHECK_TYPE( false, "Cannot inspect value of type ", this->tagKind()); - [[fallthrough]]; default: // don't record scalars. break; diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 57812446d0764..07e85677c3c75 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -10,11 +10,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -492,9 +490,7 @@ struct TORCH_API IValue final { // Custom C++ classes template < typename T, - std::enable_if_t< - std::is_base_of::value, - int> = 0> + std::enable_if_t, int> = 0> IValue(intrusive_ptr custom_class); bool isCustomClass() const; template @@ -508,17 +504,17 @@ struct TORCH_API IValue final { template < typename... Args, std::enable_if_t< - !std::disjunction< + !std::disjunction_v< std::is_lvalue_reference..., - std::negation>...>::value, + std::negation>...>, std::nullptr_t> = nullptr> IValue(const std::tuple& t); template < typename... 
Args, std::enable_if_t< - !std::disjunction< + !std::disjunction_v< std::is_lvalue_reference..., - std::negation>...>::value, + std::negation>...>, std::nullptr_t> = nullptr> IValue(std::tuple&& t); bool isTuple() const { @@ -536,8 +532,13 @@ struct TORCH_API IValue final { return Tag::Double == tag; } double toDouble() const { - AT_ASSERT(isDouble()); - return payload.u.as_double; + if (isDouble()) { + return payload.u.as_double; + } else if (isSymFloat()) { + return toSymFloat().guard_float(__FILE__, __LINE__); + } else { + TORCH_INTERNAL_ASSERT(0, "expected double"); + } } // ComplexDouble @@ -643,8 +644,13 @@ struct TORCH_API IValue final { } int64_t toInt() const { - AT_ASSERT(isInt()); - return payload.u.as_int; + if (isInt()) { + return payload.u.as_int; + } else if (isSymInt()) { + return toSymInt().guard_int(__FILE__, __LINE__); + } else { + TORCH_INTERNAL_ASSERT(0, "expected int"); + } } // Bool @@ -662,8 +668,13 @@ struct TORCH_API IValue final { return Tag::Bool == tag; } bool toBool() const { - AT_ASSERT(isBool()); - return payload.u.as_bool; + if (isBool()) { + return payload.u.as_bool; + } else if (isSymBool()) { + return toSymBool().guard_bool(__FILE__, __LINE__); + } else { + TORCH_INTERNAL_ASSERT(0, "expected bool"); + } } // IntList @@ -732,7 +743,7 @@ struct TORCH_API IValue final { // This SFINAEs the called constructor exists. template using enable_if_ivalue_constructible = - std::enable_if_t::value, std::nullptr_t>; + std::enable_if_t, std::nullptr_t>; // The rule for lists is more complicated; the generic constructor is only // acceptable if your element isn't SymInt. If you do have a SymInt element, @@ -744,8 +755,7 @@ struct TORCH_API IValue final { // they're not selectable. template using enable_if_list_is_ivalue_constructible = std::enable_if_t< - std::is_constructible::value && - !std::is_same::value, + std::is_constructible_v && !std::is_same_v, std::nullptr_t>; template = nullptr> @@ -756,6 +766,8 @@ struct TORCH_API IValue final { IValue(at::ArrayRef v); template = nullptr> IValue(const std::vector& v); + template = nullptr> + IValue(std::vector&& v); template IValue(std::array v); @@ -764,7 +776,7 @@ struct TORCH_API IValue final { // to prevent implicit conversions template using enable_if_symint = - std::enable_if_t::value, std::nullptr_t>; + std::enable_if_t, std::nullptr_t>; template = nullptr> IValue(at::ArrayRef v); @@ -772,13 +784,14 @@ struct TORCH_API IValue final { IValue(at::OptionalArrayRef v); template = nullptr> IValue(const std::vector& v); + template = nullptr> + IValue(std::vector&& v); template using enable_if_ilist_is_ivalue_constructible = std::enable_if_t< - std::is_constructible::value && - std::is_constructible::boxed_type>:: - value && - !std::is_same::value, + std::is_constructible_v && + std::is_constructible_v::boxed_type> && + !std::is_same_v, std::nullptr_t>; template = nullptr> @@ -839,7 +852,7 @@ struct TORCH_API IValue final { c10::intrusive_ptr toEnumHolder() const&; // None - IValue() : tag(Tag::None) {} + IValue() = default; bool isNone() const { return Tag::None == tag; } @@ -932,21 +945,20 @@ struct TORCH_API IValue final { // ScalarType IValue(ScalarType t) - : IValue(static_cast::type>(t)) {} + : IValue(static_cast>(t)) {} at::ScalarType toScalarType() const { return static_cast(toInt()); } // Layout - IValue(Layout l) - : IValue(static_cast::type>(l)) {} + IValue(Layout l) : IValue(static_cast>(l)) {} at::Layout toLayout() const { return static_cast(toInt()); } // MemoryFormat IValue(MemoryFormat m) - : 
IValue(static_cast::type>(m)) {} + : IValue(static_cast>(m)) {} at::MemoryFormat toMemoryFormat() const { return static_cast(toInt()); } @@ -1171,6 +1183,7 @@ struct TORCH_API IValue final { } } + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) C10_ALWAYS_INLINE void moveFrom(IValue&& rhs) noexcept { if (rhs.isTensor()) { new (&payload.as_tensor) at::Tensor(std::move(rhs.payload.as_tensor)); diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index d59d33219d82e..3e3525c274118 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -361,10 +361,10 @@ struct TORCH_API TupleElements { switch (inlineSize_) { case 3: new (&elementsInline_[2]) IValue(elements[2]); - C10_FALLTHROUGH; + [[fallthrough]]; case 2: new (&elementsInline_[1]) IValue(elements[1]); - C10_FALLTHROUGH; + [[fallthrough]]; case 1: new (&elementsInline_[0]) IValue(elements[0]); break; @@ -1034,11 +1034,9 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { */ template void addCallback(T callback, bool uses_future = true) { -#if __cpp_lib_is_invocable >= 201703 static_assert( std::is_invocable_r::value, "The callback must have signature void(Future&)"); -#endif std::unique_lock lock(mutex_); if (completed()) { @@ -1057,14 +1055,13 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { template c10::intrusive_ptr then(T callback, TypePtr type) { using IValueWithStorages = std::tuple>; -#if __cpp_lib_is_invocable >= 201703 static_assert( std::disjunction< std::is_invocable_r, std::is_invocable_r>::value, "The callback must have signature IValue(Future&) or " "std::tuple>(Future&)"); -#endif + auto childFut = createInstance(::std::move(type)); addCallback([childFut, cb = std::move(callback)](Future& parentFut) mutable { @@ -1084,11 +1081,10 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { template c10::intrusive_ptr thenAsync(T callback, TypePtr type) { -#if __cpp_lib_is_invocable >= 201703 static_assert( std::is_invocable_r, T, Future&>::value, "The callback must have signature c10::intrusive_ptr(Future&)"); -#endif + auto childFut = createInstance(std::move(type)); addCallback( [childFut, cb = std::move(callback)](Future& parentFut) mutable { @@ -1165,11 +1161,9 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { // synchronize them with the value, and so on (if needed). template void invokeCallback(T callback, bool uses_future) { -#if __cpp_lib_is_invocable >= 201703 static_assert( std::is_invocable_r::value, "The callback must have signature void(Future&)"); -#endif // The synchronization performed below shouldn't be needed when the future // is not used by the callback. @@ -1675,8 +1669,8 @@ struct _guarded_unsigned_long_unique_dummy final { _guarded_unsigned_long_unique_dummy(int64_t){}; }; using _guarded_unsigned_long = std::conditional_t< - std::is_same::value || - std::is_same::value, + std::is_same_v || + std::is_same_v, _guarded_unsigned_long_unique_dummy, unsigned long>; @@ -1936,9 +1930,9 @@ template < typename... Args, typename Indices = std::make_index_sequence, std::enable_if_t< - !std::disjunction< + !std::disjunction_v< std::is_lvalue_reference..., - std::negation>...>::value, + std::negation>...>, std::nullptr_t> = nullptr> std::tuple generic_to(const IValue& ivalue, _fake_type>) { const auto& vals = ivalue.toTupleRef().elements(); @@ -2116,9 +2110,9 @@ inline IValue::IValue(c10::intrusive_ptr v) template < typename... 
Args, std::enable_if_t< - !std::disjunction< + !std::disjunction_v< std::is_lvalue_reference..., - std::negation>...>::value, + std::negation>...>, std::nullptr_t>> inline IValue::IValue(const std::tuple& t) : IValue(c10::guts::apply(c10::ivalue::Tuple::create, t)) { @@ -2127,9 +2121,9 @@ inline IValue::IValue(const std::tuple& t) template < typename... Args, std::enable_if_t< - !std::disjunction< + !std::disjunction_v< std::is_lvalue_reference..., - std::negation>...>::value, + std::negation>...>, std::nullptr_t>> inline IValue::IValue(std::tuple&& t) : IValue(c10::guts::apply(c10::ivalue::Tuple::create, std::move(t))) { @@ -2185,6 +2179,23 @@ template > inline IValue::IValue(const std::vector& v) : IValue() { *this = IValue(at::ArrayRef(v)); } +template > +inline IValue::IValue(std::vector&& v) : IValue() { + auto vi = c10::asIntArrayRefSlowOpt(v); + if (vi.has_value()) { + // This list is entirely integers; ensure it is typed as + // an IntList so toIntList works + *this = IValue(*vi); + } else { + // This list has SymInts; type it as a SymInt + *this = IValue(impl::toList(c10::List())); + auto list = to>(); + list.reserve(v.size()); + for (auto&& e : std::move(v)) { + list.push_back(std::move(e)); + } + } +} template > inline IValue::IValue(const std::vector& v) : IValue(c10::List()) { auto list = to>(); @@ -2193,6 +2204,22 @@ inline IValue::IValue(const std::vector& v) : IValue(c10::List()) { list.push_back(e); } } + +template > +inline IValue::IValue(std::vector&& v) : IValue(c10::List()) { + auto list = to>(); + list.reserve(v.size()); + if constexpr (std::is_same_v) { + for (auto e : v) { + list.push_back(e); + } + } else { + for (auto&& e : std::move(v)) { + list.push_back(std::move(e)); + } + } +} + template > inline IValue::IValue(c10::OptionalArrayRef v) : IValue() { if (v.has_value()) { @@ -2280,7 +2307,7 @@ inline IValue IValue::make_capsule( template < typename T, - std::enable_if_t::value, int>> + std::enable_if_t, int>> IValue::IValue(c10::intrusive_ptr custom_class) : tag(Tag::Object) { auto classType = []() { try { @@ -2288,8 +2315,7 @@ IValue::IValue(c10::intrusive_ptr custom_class) : tag(Tag::Object) { } catch (const c10::Error&) { throw c10::Error( "Trying to instantiate a class that isn't a registered custom class: " + - std::string(c10::util::get_fully_qualified_type_name()), - ""); + std::string(c10::util::get_fully_qualified_type_name())); } }(); auto ivalue_obj = c10::ivalue::Object::create(std::move(classType), /* numSlots */1); diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 3f0d7970a10f9..05f7242855417 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -17,14 +17,13 @@ #include #include #include -#include #include -namespace torch { -namespace jit { + +namespace torch::jit { struct Function; -} // namespace jit -} // namespace torch +} // namespace torch::jit + namespace c10 { @@ -171,9 +170,9 @@ struct TORCH_API UnionType : public SharedType { protected: explicit UnionType(std::vector types, TypeKind kind=TypeKind::UnionType); - std::string annotation_str_impl(TypePrinter printer = nullptr) const override; + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override; std::string unionStr( - TypePrinter printer = nullptr, + const TypePrinter& printer = nullptr, bool is_annotation_str = false) const; // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) bool has_free_variables_; @@ -240,9 +239,9 @@ struct TORCH_API OptionalType : public UnionType { 
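Editor's note: one detail in the new rvalue `IValue(std::vector&&)` constructor above is the `if constexpr` branch (its template argument is stripped in this extract, but it is presumably the usual `std::vector<bool>` case) that copies elements instead of moving them: `vector<bool>` stores packed bits and its iterators yield proxy objects, so there is nothing to move. A standalone sketch of that pattern, independent of `IValue`:

```cpp
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

// Append the contents of an rvalue vector to dst, moving where that helps.
// std::vector<bool> is special-cased: its iterators return proxies over
// packed bits, so the elements are simply copied.
template <class T>
void append_moving(std::vector<T>& dst, std::vector<T>&& src) {
  dst.reserve(dst.size() + src.size());
  if constexpr (std::is_same_v<T, bool>) {
    for (auto e : src) {
      dst.push_back(e);
    }
  } else {
    for (auto&& e : src) {
      dst.push_back(std::move(e));
    }
  }
}

int main() {
  std::vector<std::string> a{"x"}, b{"y", "z"};
  append_moving(a, std::move(b));  // moves the strings into a

  std::vector<bool> c{true}, d{false, true};
  append_moving(c, std::move(d));  // copies the bits into c
  return 0;
}
```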
TypePtr contained_; - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Optional[" << getElementType()->annotation_str(std::move(printer)) << "]"; + ss << "Optional[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); } }; @@ -546,6 +545,7 @@ struct VaryingShape { return c10::nullopt; } std::vector sizes; + sizes.reserve(dims_.value().size()); for (auto d : *dims_) { if (!d) { return c10::nullopt; @@ -909,9 +909,9 @@ struct TORCH_API ListType private: ListType(TypePtr elem) : SingleElementType(std::move(elem)) {} - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "List[" << getElementType()->annotation_str(std::move(printer)) << "]"; + ss << "List[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); } }; @@ -1003,7 +1003,7 @@ struct TORCH_API DictType : public SharedType { types.push_back(std::move(value)); } - std::string annotation_str_impl(TypePrinter printer = nullptr) const override; + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override; std::vector types; bool has_free_variables; @@ -1044,9 +1044,9 @@ struct TORCH_API FutureType private: FutureType(TypePtr elem) : SingleElementType(std::move(elem)) {} - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Future[" << getElementType()->annotation_str(std::move(printer)) << "]"; + ss << "Future[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); } }; @@ -1086,7 +1086,7 @@ struct TORCH_API AwaitType private: AwaitType(TypePtr elem) : SingleElementType(std::move(elem)) {} - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; ss << "Await[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); @@ -1118,9 +1118,9 @@ struct TORCH_API RRefType private: RRefType(TypePtr elem) : SingleElementType(std::move(elem)) {} - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "RRef[" << getElementType()->annotation_str(std::move(printer)) << "]"; + ss << "RRef[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); } }; @@ -1225,7 +1225,7 @@ struct TORCH_API TupleType : public NamedType { return true; } - std::string annotation_str_impl(TypePrinter printer = nullptr) const override; + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override; std::vector elements_; bool has_free_variables_; @@ -1278,7 +1278,7 @@ struct TORCH_API NumberType : public Type { protected: NumberType(TypeKind kind = TypeKind::NumberType) : Type(kind) {} - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { return "number"; // technically not a valid python type, but // we need to use it when parsing back in annotations // for implicit conversions @@ -1305,7 +1305,7 @@ 
struct TORCH_API FloatType : public NumberType { private: FloatType() : NumberType(TypeKind::FloatType) {} - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { return "float"; } }; @@ -1330,7 +1330,7 @@ struct TORCH_API ComplexType : public NumberType { private: ComplexType() : NumberType(TypeKind::ComplexType) {} - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { return "complex"; } }; @@ -1348,7 +1348,7 @@ struct TORCH_API SymIntType : public Type { std::string str() const override { return "SymInt"; } - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { return "int"; } static const TypeKind Kind = TypeKind::SymIntType; @@ -1368,7 +1368,7 @@ struct TORCH_API SymFloatType : public Type { std::string str() const override { return "SymFloat"; } - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { return "float"; } static const TypeKind Kind = TypeKind::SymFloatType; @@ -1388,7 +1388,7 @@ struct TORCH_API SymBoolType : public Type { std::string str() const override { return "SymBool"; } - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { return "bool"; } static const TypeKind Kind = TypeKind::SymBoolType; @@ -1419,7 +1419,7 @@ struct TORCH_API IntType : public NumberType { private: IntType() : NumberType(TypeKind::IntType) {} - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { return "int"; } }; @@ -1453,7 +1453,7 @@ struct TORCH_API StringType : public Type { // we only use "str" (not "string") in both FunctionSchema and script return annotation_str(); } - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { return "str"; } static const TypeKind Kind = TypeKind::StringType; @@ -1473,7 +1473,7 @@ struct TORCH_API StorageType : public Type { std::string str() const override { return annotation_str(); } - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { return "Storage"; } static const TypeKind Kind = TypeKind::StorageType; @@ -1508,7 +1508,7 @@ struct TORCH_API FunctionType : public NamedType { private: FunctionType(torch::jit::Function* function); - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { const auto& n = name().value(); return n.qualifiedName(); } @@ -2199,7 +2199,7 @@ struct TORCH_API InterfaceType : public NamedType { const InterfaceType& rhs, std::ostream* why_not); - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED 
const TypePrinter& printer = nullptr) const override { return name()->qualifiedName(); } diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h index bf4909aaefec7..21692db56dd87 100644 --- a/aten/src/ATen/core/jit_type_base.h +++ b/aten/src/ATen/core/jit_type_base.h @@ -118,7 +118,7 @@ struct CastReturnType { }; template -struct CastReturnType::value>::type> { +struct CastReturnType::value>> { using type = SingletonTypePtr; }; @@ -128,7 +128,7 @@ struct CastConstReturnType { }; template -struct CastConstReturnType::value>::type> { +struct CastConstReturnType::value>> { using type = SingletonTypePtr; }; @@ -156,7 +156,7 @@ struct TORCH_API Type { Type(Type&&) noexcept = default; Type& operator=(Type&&) noexcept = default; - virtual std::string annotation_str_impl(TypePrinter /*printer*/) const { + virtual std::string annotation_str_impl(const TypePrinter& /*printer*/) const { return str(); } // a == b @@ -177,7 +177,7 @@ struct TORCH_API Type { /* implicit */ SingletonOrSharedTypePtr(std::shared_ptr x) : repr_(std::move(x)) {} - template ::value, bool> = true> + template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(std::shared_ptr x) : repr_(std::move(x)) {} @@ -187,7 +187,7 @@ struct TORCH_API Type { /* implicit */ SingletonOrSharedTypePtr(SingletonTypePtr p) : repr_(p) {} - template ::value, bool> = true> + template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(SingletonTypePtr p) : repr_(SingletonTypePtr(p.get())) {} @@ -205,10 +205,10 @@ struct TORCH_API Type { // Case 3: Otherwise, T is not a SharedType. (debug-check this // assumption!) Use a singleton pointer. - template ::value, bool> = true> + template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(T* p) : SingletonOrSharedTypePtr(static_cast::type>(p)->shared_from_this()) {} - template ::value, bool> = true> + template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(T* p) { if (auto* shared_p = dynamic_cast::type>(p)) { repr_ = Repr(shared_p->shared_from_this()); @@ -217,7 +217,7 @@ struct TORCH_API Type { } } - template ::value && !std::is_base_of::value, bool> = true> + template && !std::is_base_of_v, bool> = true> /* implicit */ SingletonOrSharedTypePtr(T* p) : repr_(p) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dynamic_cast::type>(p) == nullptr); @@ -244,7 +244,7 @@ struct TORCH_API Type { return repr_.isNonNull(); } - template , void>::value, bool> = true> + template , void>, bool> = true> U& operator*() const { return *get(); } @@ -409,37 +409,37 @@ struct TORCH_API Type { // Compatibility shims to accommodate existing code that passes shared_ptrs // around. Ideally, we would just delete this, but it should be harmless. 
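Editor's note: the `TypePrinter` now passed by const reference through these `annotation_str_impl` overrides is a customization hook: a caller-supplied callable that may return `nullopt` to fall back to the type's default rendering, as the `annotation_str` implementation further down in jit_type_base.h shows. The same pattern, reduced to plain standard C++ with a toy `Thing` in place of `c10::Type`:

```cpp
#include <functional>
#include <iostream>
#include <optional>
#include <string>

struct Thing {
  std::string name;
  std::string default_render() const { return "<" + name + ">"; }
};

// Caller-provided hook; returning nullopt means "use the default".
using Printer = std::function<std::optional<std::string>(const Thing&)>;

std::string render(const Thing& t, const Printer& printer = nullptr) {
  if (printer) {
    if (auto renamed = printer(t)) {
      return *renamed;  // the hook handled this value
    }
  }
  return t.default_render();  // fall through to the default rendering
}

int main() {
  Thing t{"tensor"};
  std::cout << render(t) << "\n";  // prints <tensor>
  std::cout << render(t, [](const Thing& x) -> std::optional<std::string> {
    return x.name == "tensor" ? std::optional<std::string>("Tensor") : std::nullopt;
  }) << "\n";                      // prints Tensor
  return 0;
}
```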
template - typename std::enable_if::value, bool>::type + std::enable_if_t, bool> isSubtypeOf(const std::shared_ptr& rhs) const { return isSubtypeOf(*rhs); } template - typename std::enable_if::value, bool>::type + std::enable_if_t, bool> isSubtypeOf(const SingletonOrSharedTypePtr& rhs) const { return isSubtypeOf(*rhs); } template - typename std::enable_if::value, bool>::type + std::enable_if_t, bool> isSubtypeOf(SingletonTypePtr rhs) const { return isSubtypeOf(*rhs); } template - typename std::enable_if::value, bool>::type + std::enable_if_t, bool> isSubtypeOfExt(const SingletonOrSharedTypePtr& rhs, std::ostream* why_not) const { return isSubtypeOfExt(*rhs, why_not); } template - typename std::enable_if::value, bool>::type + std::enable_if_t, bool> isSubtypeOfExt(const std::shared_ptr& rhs, std::ostream* why_not) const { return isSubtypeOfExt(*rhs, why_not); } template - typename std::enable_if::value, bool>::type + std::enable_if_t, bool> isSubtypeOfExt(SingletonTypePtr rhs, std::ostream* why_not) const { return isSubtypeOfExt(*rhs, why_not); } @@ -453,14 +453,14 @@ struct TORCH_API Type { // // Takes a custom printer that users can pass in to customize the output of // this method. - std::string annotation_str(TypePrinter printer) const { + std::string annotation_str(const TypePrinter& printer) const { if (printer) { // the printer can return nullopt to fall through to the default impl if (auto renamed = printer(*this)) { return *renamed; } } - return annotation_str_impl(std::move(printer)); + return annotation_str_impl(printer); } std::string annotation_str() const { // Overload instead of define a default value for `printer` to help @@ -583,6 +583,7 @@ struct TORCH_API Type { // per-type constructor, you only need to override this if the // containedTypes() is not empty virtual TypePtr createWithContained( + // NOLINTNEXTLINE(performance-unnecessary-value-param) std::vector /*contained_types*/) const { AT_ERROR( "type with contained types did not overload createWithContained: ", diff --git a/aten/src/ATen/core/library.cpp b/aten/src/ATen/core/library.cpp index 1341ee0c8b8d3..fd349da2f8b0c 100644 --- a/aten/src/ATen/core/library.cpp +++ b/aten/src/ATen/core/library.cpp @@ -51,6 +51,10 @@ CppFunction::CppFunction(c10::KernelFunction func, c10::optional k, const char* file, uint32_t line) @@ -129,12 +133,12 @@ Library& Library::_def(c10::FunctionSchema&& schema, c10::OperatorName* out_name } switch (rv) { case _RegisterOrVerify::REGISTER: - if (impl_abstract_pystub_.has_value()) { + if (python_module_.has_value()) { registrars_.emplace_back( - c10::Dispatcher::singleton().registerAbstractImplPyStub( + c10::Dispatcher::singleton().registerPythonModule( schema.operator_name(), - impl_abstract_pystub_->first, - impl_abstract_pystub_->second) + python_module_->first, + python_module_->second) ); } registrars_.emplace_back( @@ -153,6 +157,7 @@ Library& Library::_def(c10::FunctionSchema&& schema, c10::OperatorName* out_name } #undef DEF_PRELUDE +// NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) Library& Library::_def(std::variant&& name_or_schema, CppFunction&& f, const std::vector& tags) & { c10::FunctionSchema schema = [&] { if (std::holds_alternative(name_or_schema)){ @@ -214,6 +219,7 @@ at::OperatorName Library::_parseNameForLib(const char* name_str) const { return name; } +// NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) Library& Library::_impl(const char* name_str, CppFunction&& f, _RegisterOrVerify rv) & { at::OperatorName name = 
_parseNameForLib(name_str); // See Note [Redundancy in registration code is OK] @@ -253,6 +259,7 @@ c10::OperatorName Library::_resolve(const char* name_str) const { } #undef IMPL_PRELUDE +// NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) Library& Library::_fallback(CppFunction&& f) & { TORCH_CHECK(kind_ == IMPL, "fallback(...): Cannot define an operator inside of a ", toString(kind_), " block. " @@ -275,8 +282,8 @@ Library& Library::_fallback(CppFunction&& f) & { registrars_.emplace_back( c10::Dispatcher::singleton().registerFallback( k, - std::move(f.func_), - debugString(std::move(f.debug_), file_, line_) + f.func_, + debugString(f.debug_, file_, line_) ) ); } diff --git a/aten/src/ATen/core/op_registration/README.md b/aten/src/ATen/core/op_registration/README.md index 5605e962a6e5e..61b41b48c4a67 100644 --- a/aten/src/ATen/core/op_registration/README.md +++ b/aten/src/ATen/core/op_registration/README.md @@ -13,13 +13,13 @@ There’s four main use cases * You’re writing a new operator that isn’t supposed to be part of the public PyTorch API. * You’re writing a new operator but don’t want to change the core pytorch code base, say you’re developing a shared library with operators. * You’re writing a C++ extension for PyTorch or you’re using inline c++ in your .py model files. -* You’re writing a backend library like XLA or ORT that adds new kernels to all operators defined in `native_functions.yaml`. +* You’re writing a backend library like XLA or MAIA that adds new kernels to all operators defined in `native_functions.yaml`. For these use cases, the custom operator API is the better solution. ### What is the price for using the custom operator API instead of `native_functions.yaml`? -If you’re just using the custom operator API to add new kernels for existing operators (e.g. the XLA/ORT example above), then you’re fine and don’t pay any price. If, however, you define a new operator purely using the custom op API, i.e. your operator never shows up in `native_functions.yaml`, then you need to be aware of a few caveats. +If you’re just using the custom operator API to add new kernels for existing operators (e.g. the XLA/MAIA example above), then you’re fine and don’t pay any price. If, however, you define a new operator purely using the custom op API, i.e. your operator never shows up in `native_functions.yaml`, then you need to be aware of a few caveats. * It will not get a C++ API generated. There will not be `Tensor::your_op()` methods or `at::your_op()` functions to call your operator. * The API for calling the operator from Python looks a little bit different. It needs to be called through `torch.ops.your_op()` instead of `torch._C`. diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h index a00ef76f460b9..57409442950f2 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.h +++ b/aten/src/ATen/core/op_registration/infer_schema.h @@ -6,7 +6,6 @@ */ #include -#include #include namespace c10 { @@ -37,10 +36,10 @@ template constexpr int checkStaticTypes() { // Give nice error messages for some of the common error cases. // Use a LOUD ERROR MESSAGE SO USERS SEE THE STATIC_ASSERT - static_assert(guts::conjunction< + static_assert(std::conjunction< bool_t::value || std::is_same::value || std::is_same::value || std::is_same::value>...
>::value, "INVALID TYPE: Only int8_t, int64_t and bool are supported as an integral argument type"); - static_assert(guts::conjunction< + static_assert(std::conjunction< bool_t::value>... >::value, "INVALID TYPE: float is not supported as an argument type, use double instead"); return 0; diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index a1c9c63052f1d..377cb403cdcfd 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -1154,15 +1154,15 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { "(int[]? a) -> int[]?"); // Test list of optional (with empty list) - testArgTypes>>::test( - c10::List>(c10::List>({})), [] (const c10::List>& v) {EXPECT_EQ(0, v.size());}, - c10::List>(c10::List>({})), [] (const IValue& v) {EXPECT_EQ(0, v.to>>().size());}, + testArgTypes>>::test( + c10::List<::std::optional>(c10::List<::std::optional>({})), [] (const c10::List<::std::optional>& v) {EXPECT_EQ(0, v.size());}, + c10::List<::std::optional>(c10::List<::std::optional>({})), [] (const IValue& v) {EXPECT_EQ(0, v.to>>().size());}, "(int?[] a) -> int?[]"); // Test list of optional (with values) - testArgTypes>>::test( - c10::List>(c10::List>({3, c10::nullopt, 2})), [] (const c10::List>& v) {expectListEquals>({3, c10::nullopt, 2}, v);}, - c10::List>(c10::List>({3, c10::nullopt, 2})), [] (const IValue& v) {expectListEquals>({3, c10::nullopt, 2}, v.to>>());}, + testArgTypes>>::test( + c10::List<::std::optional>(c10::List<::std::optional>({3, c10::nullopt, 2})), [] (const c10::List<::std::optional>& v) {expectListEquals>({3, c10::nullopt, 2}, v);}, + c10::List<::std::optional>(c10::List<::std::optional>({3, c10::nullopt, 2})), [] (const IValue& v) {expectListEquals>({3, c10::nullopt, 2}, v.to>>());}, "(int?[] a) -> int?[]"); // dict types @@ -1234,15 +1234,15 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { "(Dict(int, Tensor) a) -> Dict(int, Tensor)"); // weird deeply nested type - using DeeplyNestedType = c10::List>>>>; + using DeeplyNestedType = c10::List>>>>; auto makeDeeplyNestedObject = [] () -> DeeplyNestedType { c10::Dict inner3; inner3.insert(1, "1"); - c10::List>> inner2; + c10::List<::std::optional>> inner2; inner2.push_back(std::move(inner3)); - c10::Dict>>> inner1; + c10::Dict>>> inner1; inner1.insert("key", std::move(inner2)); - c10::List>>>> result; + c10::List>>>> result; result.push_back(inner1); return result; }; diff --git a/aten/src/ATen/core/rref_interface.h b/aten/src/ATen/core/rref_interface.h index cefb29c08ddc6..f0749d368792f 100644 --- a/aten/src/ATen/core/rref_interface.h +++ b/aten/src/ATen/core/rref_interface.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace c10 { diff --git a/aten/src/ATen/core/stack.h b/aten/src/ATen/core/stack.h index 1695e5995ab69..5dc89da6c5627 100644 --- a/aten/src/ATen/core/stack.h +++ b/aten/src/ATen/core/stack.h @@ -8,8 +8,8 @@ // TODO move this to c10 namespace -namespace torch { -namespace jit { + +namespace torch::jit { using c10::IValue; using Stack = std::vector; @@ -22,13 +22,14 @@ class Operation { template ::value, int> = 0> C10_DEPRECATED_MESSAGE("Please use void(Stack&) to register operator instead.") + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) Operation(F&& raw): op_([raw = std::forward(raw)](Stack& stack) { raw(&stack); }) {} template ::value && - !std::is_same, Operation>::value, int> = 0> + !std::is_same_v, Operation>, int> 
= 0> Operation(F&& op): op_(std::forward(op)) {} Operation(std::nullptr_t) noexcept {} @@ -66,12 +67,14 @@ class Operation { // treat the last N elements of the stack as a list, looking up // element i static inline IValue& peek(Stack& stack, size_t i, size_t N) { + // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions) return *(stack.end() - N + i); } static inline IValue& peek(Stack* stack, size_t i, size_t N) { return peek(*stack, i, N); } static inline const IValue& peek(const Stack& stack, size_t i, size_t N) { + // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions) return *(stack.end() - N + i); } static inline const IValue& peek(const Stack* stack, size_t i, size_t N) { @@ -93,6 +96,7 @@ static inline at::ArrayRef last(const Stack* stack, size_t N) { return last(*stack, N); } static inline void drop(Stack& stack, size_t n) { + // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions) stack.erase(stack.end() - n, stack.end()); } static inline void drop(Stack* stack, size_t n) { @@ -188,6 +192,7 @@ struct TuplePacker { template struct TuplePacker<0, Args...> { + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) static void execute(Stack& /*stack*/, std::tuple&& /*t*/){}; }; @@ -196,5 +201,4 @@ inline void pack(Stack& stack, std::tuple&& t) { TuplePacker::execute(stack, std::move(t)); } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/aten/src/ATen/core/tensor_type.cpp b/aten/src/ATen/core/tensor_type.cpp index efe2d4cb18703..c7f8c8b05f91e 100644 --- a/aten/src/ATen/core/tensor_type.cpp +++ b/aten/src/ATen/core/tensor_type.cpp @@ -223,9 +223,9 @@ VaryingShape TensorType::computeStrideProps( has_overlap = possible_cross_dimension_overlap(sizes, strides); } } - std::vector stride_properties; - + std::vector stride_properties; + stride_properties.reserve(stride_indices.size()); for (size_t i = 0; i < stride_indices.size(); i++) { bool contiguous_ = tensor_contiguity; if (!contiguous_) { @@ -338,6 +338,7 @@ template struct VaryingShape; template struct VaryingShape; template struct VaryingShape; template struct VaryingShape; +template struct VaryingShape; VaryingShape TensorType::sizes() const { if (!sizes_.rank()) { diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 53c08ae26d5df..f7d67ca84861a 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -41,7 +41,7 @@ static_assert( sizeof(SingletonOrSharedTypePtr) == sizeof(std::shared_ptr) && sizeof(std::shared_ptr) == 2 * sizeof(void*), "std::shared_ptr has an unexpected representation on this platform!"); static_assert( - std::is_same>()), const TupleTypePtr&>::value, + std::is_same_v>()), const TupleTypePtr&>, "getTypePtr> not returning const ref!"); TypeVerbosity type_verbosity() { @@ -314,9 +314,9 @@ TypePtr DictType::get(const std::string& identifier, TypePtr key, TypePtr value) return containerTypePtrs[map_key]; } -std::string DictType::annotation_str_impl(TypePrinter printer) const { +std::string DictType::annotation_str_impl(const TypePrinter& printer) const { auto keyAnnotation = getKeyType()->annotation_str(printer); - auto valueAnnotation = getValueType()->annotation_str(std::move(printer)); + auto valueAnnotation = getValueType()->annotation_str(printer); std::string result; result.reserve(5 /* "Dict[" */ + keyAnnotation.size() + 2 /* ", " */ + valueAnnotation.size() + 1 /* "]" */); @@ -500,7 +500,7 @@ MatchTypeReturn matchTypeVariables( if (it == type_env.end()) { type_env[vt->name()] = actual; return 
MatchTypeReturn::Success(); - } else if (auto unified = unifyTypes(it->second, actual)) { + } else if (unifyTypes(it->second, actual)) { // note: unifyTypes allows subtyping in either direction, so actual // may be a supertype of the current binding. we're not responsible // for reporting the error, only for keeping type_env stable @@ -916,7 +916,7 @@ std::string TupleType::str() const { } return ss.str(); } -std::string TupleType::annotation_str_impl(TypePrinter printer) const { +std::string TupleType::annotation_str_impl(const TypePrinter& printer) const { if (schema_ && name()) { return name()->qualifiedName(); } diff --git a/aten/src/ATen/core/type_ptr.h b/aten/src/ATen/core/type_ptr.h index d14c3b8a45641..0859e04c7d2d8 100644 --- a/aten/src/ATen/core/type_ptr.h +++ b/aten/src/ATen/core/type_ptr.h @@ -20,7 +20,7 @@ class SingletonTypePtr { using element_type = typename std::shared_ptr::element_type; - template , void>::value, bool> = true> + template , void>, bool> = true> T& operator*() const { return *repr_; } diff --git a/aten/src/ATen/core/union_type.cpp b/aten/src/ATen/core/union_type.cpp index a35c59584095b..2acc4c497ba56 100644 --- a/aten/src/ATen/core/union_type.cpp +++ b/aten/src/ATen/core/union_type.cpp @@ -359,7 +359,7 @@ bool UnionType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { }); } -std::string UnionType::unionStr(TypePrinter printer, bool is_annotation_str) +std::string UnionType::unionStr(const TypePrinter& printer, bool is_annotation_str) const { std::stringstream ss; @@ -399,7 +399,7 @@ std::string UnionType::unionStr(TypePrinter printer, bool is_annotation_str) ss << ", "; } if (is_annotation_str) { - ss << NumberType::get()->annotation_str(std::move(printer)); + ss << NumberType::get()->annotation_str(printer); } else { ss << NumberType::get()->str(); } @@ -412,8 +412,8 @@ std::string UnionType::str() const { return this->unionStr(nullptr, /*is_annotation_str=*/false); } -std::string UnionType::annotation_str_impl(TypePrinter printer) const { - return this->unionStr(std::move(printer), /*is_annotation_str=*/true); +std::string UnionType::annotation_str_impl(const TypePrinter& printer) const { + return this->unionStr(printer, /*is_annotation_str=*/true); } bool UnionType::canHoldType(const Type& type) const { diff --git a/aten/src/ATen/cpp_custom_type_hack.h b/aten/src/ATen/cpp_custom_type_hack.h index 75b900c0d694d..1367ef94df738 100644 --- a/aten/src/ATen/cpp_custom_type_hack.h +++ b/aten/src/ATen/cpp_custom_type_hack.h @@ -57,8 +57,7 @@ #include #endif -namespace at { -namespace cpp_custom_type_hack { +namespace at::cpp_custom_type_hack { template [[deprecated( @@ -108,5 +107,4 @@ create(std::unique_ptr ptr, TensorOptions options) { return retval; } -} // namespace cpp_custom_type_hack -} // namespace at +} // namespace at::cpp_custom_type_hack diff --git a/aten/src/ATen/cpu/vec/functional_base.h b/aten/src/ATen/cpu/vec/functional_base.h index 3b183ad965279..48d44dc42c33c 100644 --- a/aten/src/ATen/cpu/vec/functional_base.h +++ b/aten/src/ATen/cpu/vec/functional_base.h @@ -78,6 +78,35 @@ struct VecReduceAllSIMD { #endif // defined(CPU_CAPABILITY_AVX512) #endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE) +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) +template +struct VecReduceAllSIMD { + static inline float apply(const Op& vec_fun, const Vectorized& acc_vec) { + using Vec = Vectorized; + Vec v = acc_vec; + + // 128-bit shuffle: [a1, a2, a3, a4, a5, a6, a7, a8] -> [a5, a6, a7, 
a8, a1, a2, a3, a4] + Vec v1 = {v.get_high(), v.get_low()}; + // [a1+a5, a2+a6, a3+a7, a4+a8, -, -, -, -] ('+' stands for the reduction function. Note that the last 4 elements are not required) + v = vec_fun(v, v1); + + // 64-bit shuffle: [a1+a5, a2+a6, a3+a7, a4+a8, -, -, -, -] -> [a3+a7, a4+a8, a1+a5, a2+a6, -, -, -, -] + float32x4_t v1_1 = vextq_f32(v.get_low(), v.get_low(), 2); + v1 = {v1_1, v1_1}; + // [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, -, -, -] + v = vec_fun(v, v1); + + // 32-bit shuffle: [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, -, -, -] -> [a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, -, -, -, -] + v1_1 = vrev64q_f32(v.get_low()); + v1 = {v1_1, v1_1}; + // [a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, -, -, -, -] + v = vec_fun(v, v1); + + return v.get_low()[0]; + } +}; +#endif // defined(__aarch64__) + template inline scalar_t vec_reduce_all(const Op& vec_fun, const Vectorized& acc_vec) { return VecReduceAllSIMD::apply(vec_fun, acc_vec); diff --git a/aten/src/ATen/cpu/vec/functional_bfloat16.h b/aten/src/ATen/cpu/vec/functional_bfloat16.h index 03cb017549ce9..3bd22b3820f0b 100644 --- a/aten/src/ATen/cpu/vec/functional_bfloat16.h +++ b/aten/src/ATen/cpu/vec/functional_bfloat16.h @@ -45,6 +45,34 @@ inline Vectorized convert_from_float(const Vectorized& a, con return convert_float_half(a, b); } +template , int> = 0> +inline void load_to_float(const scalar_t *data, Vectorized &out1, Vectorized &out2); + +template <> +inline void load_to_float (const BFloat16 *data, Vectorized &out1, Vectorized &out2) { + load_fp32_from_bf16(data, out1, out2); +} + +template <> +inline void load_to_float (const Half *data, Vectorized &out1, Vectorized &out2) { + load_fp32_from_fp16(data, out1, out2); +} + +template , int> = 0> +inline void load_to_float(const scalar_t *data, Vectorized &out); + +template <> +inline void load_to_float (const BFloat16 *data, Vectorized &out) { + load_fp32_from_bf16(data, out); +} + +template <> +inline void load_to_float (const Half *data, Vectorized &out) { + load_fp32_from_fp16(data, out); +} + // Note that we already have specialized member of Vectorized for BFloat16 // so the following functions would run smoothly: // using Vec = Vectorized; @@ -74,8 +102,7 @@ inline float reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { using fVec = vec::Vectorized; if (size < bVec::size()) { bVec data_bvec = bVec::loadu(data, size); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size > fVec::size()) { data_fvec0 = fVec::set(data_fvec0, vec_fun(data_fvec0, data_fvec1), size - fVec::size()); return vec_reduce_all(vec_fun, data_fvec0, fVec::size()); @@ -85,19 +112,16 @@ inline float reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { } int64_t d = bVec::size(); bVec acc_bvec = bVec::loadu(data); - fVec acc_fvec0, acc_fvec1; - std::tie(acc_fvec0, acc_fvec1) = convert_to_float(acc_bvec); + auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec); for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(data + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); acc_fvec0 = vec_fun(acc_fvec0, data_fvec0); acc_fvec1 = vec_fun(acc_fvec1, data_fvec1); } if (size - d > 0) { bVec data_bvec = 
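Aside: the aarch64 specialization above reduces an 8-lane Vectorized<float> with three shuffle/op rounds. The same ladder on a single float32x4_t, as a standalone sketch (reduce4 is a hypothetical helper; the op is assumed associative and commutative, as the reductions here are):

#include <arm_neon.h>

template <typename Op>
float reduce4(const Op& op, float32x4_t v) {
  // [a,b,c,d] op [c,d,a,b] -> pairwise partials in every lane
  v = op(v, vextq_f32(v, v, 2));
  // swap within 64-bit pairs and combine -> full reduction in lane 0
  v = op(v, vrev64q_f32(v));
  return vgetq_lane_f32(v, 0);
}

Called as reduce4([](float32x4_t a, float32x4_t b) { return vaddq_f32(a, b); }, v) this computes a horizontal sum.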
bVec::loadu(data + d, size - d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size - d > fVec::size()) { acc_fvec0 = vec_fun(acc_fvec0, data_fvec0); acc_fvec1 = fVec::set(acc_fvec1, vec_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); @@ -117,8 +141,7 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_f using fVec = vec::Vectorized; if (size < bVec::size()) { bVec data_bvec = bVec::loadu(data, size); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size > fVec::size()) { fVec acc1_fvec = fVec::set(data_fvec0, vec_fun1(data_fvec0, data_fvec1), size - fVec::size()); fVec acc2_fvec = fVec::set(data_fvec0, vec_fun2(data_fvec0, data_fvec1), size - fVec::size()); @@ -133,14 +156,11 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_f } int64_t d = bVec::size(); bVec acc_bvec = bVec::loadu(data); - fVec acc1_fvec0, acc1_fvec1; - std::tie(acc1_fvec0, acc1_fvec1) = convert_to_float(acc_bvec); - fVec acc2_fvec0, acc2_fvec1; - std::tie(acc2_fvec0, acc2_fvec1) = convert_to_float(acc_bvec); + auto [acc1_fvec0, acc1_fvec1] = convert_to_float(acc_bvec); + auto [acc2_fvec0, acc2_fvec1] = convert_to_float(acc_bvec); for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(data + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); acc1_fvec0 = vec_fun1(acc1_fvec0, data_fvec0); acc1_fvec1 = vec_fun1(acc1_fvec1, data_fvec1); acc2_fvec0 = vec_fun2(acc2_fvec0, data_fvec0); @@ -148,8 +168,7 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_f } if (size - d > 0) { bVec data_bvec = bVec::loadu(data + d, size - d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size - d > fVec::size()) { acc1_fvec0 = vec_fun1(acc1_fvec0, data_fvec0); acc1_fvec1 = fVec::set(acc1_fvec1, vec_fun1(acc1_fvec1, data_fvec1), size - d - fVec::size()); @@ -178,8 +197,7 @@ inline float map_reduce_all( using fVec = vec::Vectorized; if (size < bVec::size()) { bVec data_bvec = bVec::loadu(data, size); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size > fVec::size()) { data_fvec0 = map_fun(data_fvec0); data_fvec1 = map_fun(data_fvec1); @@ -192,14 +210,12 @@ inline float map_reduce_all( } int64_t d = bVec::size(); bVec acc_bvec = bVec::loadu(data); - fVec acc_fvec0, acc_fvec1; - std::tie(acc_fvec0, acc_fvec1) = convert_to_float(acc_bvec); + auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec); acc_fvec0 = map_fun(acc_fvec0); acc_fvec1 = map_fun(acc_fvec1); for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(data + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); data_fvec0 = map_fun(data_fvec0); data_fvec1 = map_fun(data_fvec1); acc_fvec0 = red_fun(acc_fvec0, data_fvec0); @@ -207,8 +223,7 @@ inline float map_reduce_all( } if (size - d > 0) { bVec data_bvec = bVec::loadu(data + d, size - d); - fVec data_fvec0, data_fvec1; - 
std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size - d > fVec::size()) { data_fvec0 = map_fun(data_fvec0); data_fvec1 = map_fun(data_fvec1); @@ -235,11 +250,9 @@ inline float map2_reduce_all( using fVec = vec::Vectorized; if (size < bVec::size()) { bVec data_bvec = bVec::loadu(data, size); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(data2, size); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); if (size > fVec::size()) { data_fvec0 = map_fun(data_fvec0, data2_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1); @@ -252,20 +265,16 @@ inline float map2_reduce_all( } int64_t d = bVec::size(); bVec acc_bvec = bVec::loadu(data); - fVec acc_fvec0, acc_fvec1; - std::tie(acc_fvec0, acc_fvec1) = convert_to_float(acc_bvec); + auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec); bVec acc2_bvec = bVec::loadu(data2); - fVec acc2_fvec0, acc2_fvec1; - std::tie(acc2_fvec0, acc2_fvec1) = convert_to_float(acc2_bvec); + auto [acc2_fvec0, acc2_fvec1] = convert_to_float(acc2_bvec); acc_fvec0 = map_fun(acc_fvec0, acc2_fvec0); acc_fvec1 = map_fun(acc_fvec1, acc2_fvec1); for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(data + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(data2 + d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); data_fvec0 = map_fun(data_fvec0, data2_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1); acc_fvec0 = red_fun(acc_fvec0, data_fvec0); @@ -273,11 +282,9 @@ inline float map2_reduce_all( } if (size - d > 0) { bVec data_bvec = bVec::loadu(data + d, size - d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(data2 + d, size - d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); if (size - d > fVec::size()) { data_fvec0 = map_fun(data_fvec0, data2_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1); @@ -305,14 +312,11 @@ inline float map3_reduce_all( using fVec = vec::Vectorized; if (size < bVec::size()) { bVec data_bvec = bVec::loadu(data, size); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(data2, size); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(data3, size); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); if (size > fVec::size()) { data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1); @@ -325,26 
+329,20 @@ inline float map3_reduce_all( } int64_t d = bVec::size(); bVec acc_bvec = bVec::loadu(data); - fVec acc_fvec0, acc_fvec1; - std::tie(acc_fvec0, acc_fvec1) = convert_to_float(acc_bvec); + auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec); bVec acc2_bvec = bVec::loadu(data2); - fVec acc2_fvec0, acc2_fvec1; - std::tie(acc2_fvec0, acc2_fvec1) = convert_to_float(acc2_bvec); + auto [acc2_fvec0, acc2_fvec1] = convert_to_float(acc2_bvec); bVec acc3_bvec = bVec::loadu(data3); - fVec acc3_fvec0, acc3_fvec1; - std::tie(acc3_fvec0, acc3_fvec1) = convert_to_float(acc3_bvec); + auto [acc3_fvec0, acc3_fvec1] = convert_to_float(acc3_bvec); acc_fvec0 = map_fun(acc_fvec0, acc2_fvec0, acc3_fvec0); acc_fvec1 = map_fun(acc_fvec1, acc2_fvec1, acc3_fvec1); for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(data + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(data2 + d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(data3 + d); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1); acc_fvec0 = red_fun(acc_fvec0, data_fvec0); @@ -352,14 +350,11 @@ inline float map3_reduce_all( } if (size - d > 0) { bVec data_bvec = bVec::loadu(data + d, size - d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(data2 + d, size - d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(data3 + d, size - d); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); if (size - d > fVec::size()) { data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1); @@ -386,8 +381,7 @@ inline void map( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(input_data + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); fVec output_fvec0 = vec_fun(data_fvec0); fVec output_fvec1 = vec_fun(data_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -395,8 +389,7 @@ inline void map( } if (size - d > 0) { bVec data_bvec = bVec::loadu(input_data + d, size - d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); fVec output_fvec0 = vec_fun(data_fvec0); fVec output_fvec1 = vec_fun(data_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -452,11 +445,9 @@ inline void map2( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(input_data + d); - fVec 
data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(input_data2 + d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); fVec output_fvec0 = vec_fun(data_fvec0, data2_fvec0); fVec output_fvec1 = vec_fun(data_fvec1, data2_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -464,11 +455,9 @@ inline void map2( } if (size - d > 0) { bVec data_bvec = bVec::loadu(input_data + d, size - d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(input_data2 + d, size - d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); fVec output_fvec0 = vec_fun(data_fvec0, data2_fvec0); fVec output_fvec1 = vec_fun(data_fvec1, data2_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -490,14 +479,11 @@ inline void map3( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data1_bvec = bVec::loadu(input_data1 + d); - fVec data1_fvec0, data1_fvec1; - std::tie(data1_fvec0, data1_fvec1) = convert_to_float(data1_bvec); + auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec); bVec data2_bvec = bVec::loadu(input_data2 + d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(input_data3 + d); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0); fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -505,14 +491,11 @@ inline void map3( } if (size - d > 0) { bVec data1_bvec = bVec::loadu(input_data1 + d, size - d); - fVec data1_fvec0, data1_fvec1; - std::tie(data1_fvec0, data1_fvec1) = convert_to_float(data1_bvec); + auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec); bVec data2_bvec = bVec::loadu(input_data2 + d, size - d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(input_data3 + d, size - d); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0); fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -535,17 +518,13 @@ inline void map4( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data1_bvec = bVec::loadu(input_data1 + d); - fVec data1_fvec0, data1_fvec1; - std::tie(data1_fvec0, data1_fvec1) = convert_to_float(data1_bvec); + auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec); bVec data2_bvec = bVec::loadu(input_data2 + d); - fVec data2_fvec0, data2_fvec1; - 
std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(input_data3 + d); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); bVec data4_bvec = bVec::loadu(input_data4 + d); - fVec data4_fvec0, data4_fvec1; - std::tie(data4_fvec0, data4_fvec1) = convert_to_float(data4_bvec); + auto [data4_fvec0, data4_fvec1] = convert_to_float(data4_bvec); fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -553,17 +532,13 @@ inline void map4( } if (size - d > 0) { bVec data1_bvec = bVec::loadu(input_data1 + d, size - d); - fVec data1_fvec0, data1_fvec1; - std::tie(data1_fvec0, data1_fvec1) = convert_to_float(data1_bvec); + auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec); bVec data2_bvec = bVec::loadu(input_data2 + d, size - d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(input_data3 + d, size - d); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); bVec data4_bvec = bVec::loadu(input_data4 + d, size - d); - fVec data4_fvec0, data4_fvec1; - std::tie(data4_fvec0, data4_fvec1) = convert_to_float(data4_bvec); + auto [data4_fvec0, data4_fvec1] = convert_to_float(data4_bvec); fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h index 8127ddd4a9a4f..84c3e8b6e5ce4 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256.h @@ -9,6 +9,7 @@ #if !(defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_ZVECTOR)) #include #include +#include #include #include #include @@ -22,6 +23,9 @@ #include #endif +#include +#include + #include #include #include @@ -69,7 +73,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { } -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -94,7 +98,8 @@ inline Vectorized cast(const Vectorized& src) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - +#ifndef _MSC_VER +// MSVC is not working well on complex function overload. template std::enable_if_t> inline gather(const double* base_addr, const Vectorized& vindex) { @@ -106,9 +111,10 @@ std::enable_if_t& vindex) { return _mm256_i32gather_ps(base_addr, vindex, scale); } - +#endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - +#ifndef _MSC_VER +// MSVC is not working well on complex function overload. 
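Aside: most of the functional_bfloat16.h churn above is the same mechanical change, repeated per hunk: two declared fVec temporaries plus std::tie become one C++17 structured binding. A toy standalone version of the pattern (split and sum_halves are stand-ins, not convert_to_float):

#include <tuple>

// Stand-in for convert_to_float(): return two "halves" of a packed value.
std::tuple<float, float> split(int packed) {
  return std::make_tuple(static_cast<float>(packed & 0xffff),
                         static_cast<float>(packed >> 16));
}

float sum_halves(int packed) {
  // Before: float lo, hi; std::tie(lo, hi) = split(packed);
  auto [lo, hi] = split(packed);   // after: one declaration, no std::tie
  return lo + hi;
}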
template std::enable_if_t> inline mask_gather(const Vectorized& src, const double* base_addr, @@ -122,7 +128,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, const Vectorized& vindex, Vectorized& mask) { return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); } - +#endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Only works for inputs in the range: [-2^51, 2^51] @@ -143,6 +149,24 @@ inline convert_to_int_of_same_size(const Vectorized &src) { return _mm256_cvttps_epi32(src); } +// Only works for inputs in the range: [-2^51, 2^51] +// From: https://stackoverflow.com/a/41148578 +template<> +Vectorized +inline convert_to_fp_of_same_size(const Vectorized &src) { + auto x = _mm256_add_epi64(src, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))); + return _mm256_sub_pd( + _mm256_castsi256_pd(x), + _mm256_set1_pd(0x0018000000000000) + ); +} + +template<> +Vectorized +inline convert_to_fp_of_same_size(const Vectorized &src) { + return _mm256_cvtepi32_ps(src); +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> @@ -284,6 +308,6 @@ inline Vectorized flip(const Vectorized & v) { return flip8(v); } -#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#endif // (defined(CPU_CAPABILITY_AVX2) }} // namepsace at::vec::CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h index 5e302be45acce..19e0320d8abf6 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h @@ -7,7 +7,8 @@ #include #include -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS #include #endif @@ -18,7 +19,18 @@ namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) + +#ifndef SLEEF_CONST +#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +#define SLEEF_CONST const +#else +#define SLEEF_CONST +#endif +#define SLEEF_CONST_OLD SLEEF_CONST +#else +#define SLEEF_CONST_OLD +#endif // bfloat16 conversion static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { @@ -31,6 +43,28 @@ static inline void cvtbf16_fp32(const __m256i& a, __m256& o1, __m256& o2) { cvtbf16_fp32(lo, o1); cvtbf16_fp32(hi, o2); } + +static inline __m128i cvtfp32_bf16(const __m256& src) { + __m256i value = _mm256_castps_si256(src); + __m256i nan = _mm256_set1_epi32(0xffff); + __m256i mask = _mm256_castps_si256(_mm256_cmp_ps(src, src, _CMP_ORD_Q)); + __m256i ones = _mm256_set1_epi32(0x1); + __m256i vec_bias = _mm256_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_value = _mm256_and_si256(_mm256_srli_epi32(value, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; + t_value = _mm256_add_epi32(t_value, vec_bias); + // input += rounding_bias; + t_value = _mm256_add_epi32(t_value, value); + // input = input >> 16; + t_value = _mm256_srli_epi32(t_value, 16); + // Check NaN before converting back to bf16 + t_value = _mm256_blendv_epi8(nan, t_value, mask); + t_value = _mm256_packus_epi32(t_value, t_value); // t[4-7] t[4-7] t[0-4] t[0-4] + t_value = _mm256_permute4x64_epi64(t_value, 0xd8); // 11 01 10 00 + return _mm256_castsi256_si128(t_value); +} + static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) { __m256i lo = _mm256_castps_si256(a); __m256i hi = 
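Aside: the new single-register cvtfp32_bf16 above uses the round-to-nearest-even bias trick spelled out in its comments. The same computation on one scalar, as a reference sketch (fp32_to_bf16_rne is a hypothetical name):

#include <cstdint>
#include <cstring>

uint16_t fp32_to_bf16_rne(float f) {
  if (f != f) {
    return 0xffff;                        // NaN, matching the blend above
  }
  uint32_t input;
  std::memcpy(&input, &f, sizeof input);
  uint32_t lsb = (input >> 16) & 1;       // lowest surviving mantissa bit
  uint32_t rounding_bias = 0x7fff + lsb;  // ties round to even
  input += rounding_bias;
  return static_cast<uint16_t>(input >> 16);
}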
_mm256_castps_si256(b); @@ -80,6 +114,11 @@ static inline void cvtfp16_fp32(const __m256i& a, __m256& o1, __m256& o2) { cvtfp16_fp32(hi, o2); } +static inline __m128i cvtfp32_fp16(const __m256& src) { + return _mm256_cvtps_ph( + src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +} + static inline __m256i cvtfp32_fp16(const __m256& a, const __m256& b) { __m128i lo = _mm256_cvtps_ph( a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); @@ -265,7 +304,8 @@ static_assert( } return b; } - Vectorized map(const __m256 (*const vop)(__m256)) const { + + Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { __m256 lo, hi; cvt_to_fp32(values, lo, hi); const auto o1 = vop(lo); @@ -285,14 +325,14 @@ static_assert( Vectorized angle() const { __m256 lo, hi; cvt_to_fp32(values, lo, hi); - auto angle_lambda = [](__m256 values) { + auto angle_lambda = [](__m256 values_2) { const auto zero_vec = _mm256_set1_ps(0.f); const auto nan_vec = _mm256_set1_ps(NAN); - const auto not_nan_mask = _mm256_cmp_ps(values, values, _CMP_EQ_OQ); + const auto not_nan_mask = _mm256_cmp_ps(values_2, values_2, _CMP_EQ_OQ); const auto nan_mask = _mm256_cmp_ps(not_nan_mask, zero_vec, _CMP_EQ_OQ); const auto pi = _mm256_set1_ps(c10::pi); - const auto neg_mask = _mm256_cmp_ps(values, zero_vec, _CMP_LT_OQ); + const auto neg_mask = _mm256_cmp_ps(values_2, zero_vec, _CMP_LT_OQ); auto angle = _mm256_blendv_ps(zero_vec, pi, neg_mask); angle = _mm256_blendv_ps(angle, nan_vec, nan_mask); return angle; @@ -313,6 +353,9 @@ static_assert( Vectorized acos() const { return map(Sleef_acosf8_u10); } + Vectorized acosh() const { + return map(Sleef_acoshf8_u10); + } Vectorized asin() const { return map(Sleef_asinf8_u10); } @@ -1023,7 +1066,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); CONVERT_VECTORIZED_INIT(Half, half); -#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#else // defined(CPU_CAPABILITY_AVX2) #define CONVERT_NON_VECTORIZED_INIT(type, name) \ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ @@ -1046,11 +1089,49 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V return Vectorized::loadu(arr2); \ } CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) +inline std::tuple, Vectorized> convert_half_float(const Vectorized& a) { + static_assert(Vectorized::size() == 2 * Vectorized::size()); +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + float16x8x2_t arr = a; + float16x8_t x = arr.val[0]; + float16x8_t y = arr.val[1]; +#else + auto arr = reinterpret_cast(a.operator const Half*()); + float16x8_t x = vld1q_f16(arr); + float16x8_t y = vld1q_f16(arr + Vectorized::size()); +#endif + float32x4_t x1 = vcvt_f32_f16(vget_low_f16(x)); + float32x4_t x2 = vcvt_f32_f16(vget_high_f16(x)); + float32x4_t y1 = vcvt_f32_f16(vget_low_f16(y)); + float32x4_t y2 = vcvt_f32_f16(vget_high_f16(y)); + return { Vectorized(x1, x2), Vectorized(y1, y2) }; +} +inline Vectorized convert_float_half(const Vectorized& a, const Vectorized& b) { + static_assert(Vectorized::size() == 2 * Vectorized::size()); + float32x4x2_t x = a; + float32x4x2_t y = b; + float16x4_t x1 = vcvt_f16_f32(x.val[0]); + float16x4_t x2 = vcvt_f16_f32(x.val[1]); + float16x4_t y1 = vcvt_f16_f32(y.val[0]); + float16x4_t y2 = vcvt_f16_f32(y.val[1]); +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + return Vectorized(vcombine_f16(x1, x2), vcombine_f16(y1, y2)); +#else + 
Vectorized rc; + auto arr = reinterpret_cast(rc.operator Half*()); + vst1q_f16(arr, vcombine_f16(x1, x2)); + vst1q_f16(arr + Vectorized::size(), vcombine_f16(y1, y2)); + return rc; +#endif +} +#else CONVERT_NON_VECTORIZED_INIT(Half, half); +#endif -#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#endif // defined(CPU_CAPABILITY_AVX2) -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) #define LOAD_FP32_VECTORIZED_INIT(type, name) \ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ auto values = _mm_loadu_si128(reinterpret_cast(data)); \ @@ -1069,7 +1150,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); LOAD_FP32_VECTORIZED_INIT(Half, fp16); -#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#else // defined(CPU_CAPABILITY_AVX2) #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ __at_align__ float values[Vectorized::size()]; \ diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h index f93ea1e63c38d..6c198fb37d3d1 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h @@ -8,7 +8,8 @@ #include #include -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS #include #endif @@ -16,7 +17,7 @@ namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) template <> class Vectorized> { private: @@ -145,7 +146,7 @@ template <> class Vectorized> { auto abs = abs_(); auto zero = _mm256_setzero_pd(); auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); - auto div = values / abs; + auto div = _mm256_div_pd(values, abs); return _mm256_blendv_pd(div, zero, mask); } __m256d real_() const { diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h index 7c142c04b79c0..c72d4d49274a0 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h @@ -7,7 +7,8 @@ #include #include #include -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS #include #endif @@ -15,7 +16,7 @@ namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) template <> class Vectorized> { private: @@ -180,7 +181,7 @@ template <> class Vectorized> { auto abs = abs_(); auto zero = _mm256_setzero_ps(); auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); - auto div = values / abs; + auto div = _mm256_div_ps(values, abs); return _mm256_blendv_ps(div, zero, mask); } __m256 real_() const { diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_convert.h b/aten/src/ATen/cpu/vec/vec256/vec256_convert.h new file mode 100644 index 0000000000000..55f26c606d8bd --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec256/vec256_convert.h @@ -0,0 +1,215 @@ +#pragma once + +#include +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + 
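Aside: the change above in the complex headers replaces the expression `values / abs`, which relies on GCC/Clang vector-extension operators, with the explicit divide intrinsic; that form also builds under MSVC, consistent with the _MSC_VER guards being relaxed throughout these headers. A one-line sketch of the pattern (normalize is a hypothetical name):

#include <immintrin.h>

// Element-wise divide without the vector-extension operator/.
__m256d normalize(__m256d v, __m256d magnitude) {
  return _mm256_div_pd(v, magnitude);
}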
VectorizedN result; + __m256 value; + cvtbf16_fp32(_mm256_castsi256_si128(src[0]), value); + result[0] = value; + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + __m256 value; + cvtfp16_fp32(_mm256_castsi256_si128(src[0]), value); + result[0] = value; + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + result[0] = _mm256_castsi128_si256(cvtfp32_bf16(src[0])); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + result[0] = _mm256_castsi128_si256(cvtfp32_fp16(src[0])); + return result; + } +}; + +template <> +inline Vectorized convert_to_fp_of_same_size( + const Vectorized& src); + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto low_double = at::vec::convert_to_fp_of_same_size(src[0]); + auto low = _mm256_cvtpd_ps(low_double); + auto high_double = at::vec::convert_to_fp_of_same_size(src[1]); + auto high = _mm256_cvtpd_ps(high_double); + return Vectorized( + _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1)); + } +}; + +template <> +inline Vectorized convert_to_int_of_same_size( + const Vectorized& src); + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + at::vec::VectorizedN result; + auto int32_vec = at::vec::convert_to_int_of_same_size(src[0]); + result[0] = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(int32_vec)); + result[1] = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(int32_vec, 1)); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto low = _mm256_shuffle_epi32(src[0], _MM_SHUFFLE(2, 0, 2, 0)); + auto high = _mm256_shuffle_epi32(src[1], _MM_SHUFFLE(2, 0, 2, 0)); + auto low_perm = _mm256_permute4x64_epi64(low, _MM_SHUFFLE(3, 1, 2, 0)); + auto high_perm = _mm256_permute4x64_epi64(high, _MM_SHUFFLE(3, 1, 2, 0)); + return Vectorized(_mm256_blend_epi32(low_perm, high_perm, 0xF0)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + at::vec::VectorizedN result; + result[0] = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(src[0])); + result[1] = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src[0], 1)); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src128 = _mm256_castsi256_si128(src[0]); + return Vectorized(_mm256_cvtepi8_epi32(src128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src128 = _mm256_castsi256_si128(src[0]); + return Vectorized(_mm256_cvtepu8_epi32(src128)); + } +}; + +template +struct VecConvert< + dst_t, + 1, + src_t, + 1, + typename std::enable_if_t< + (is_reduced_floating_point_v && is_8bit_integer_v) || + (is_reduced_floating_point_v && is_8bit_integer_v), + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN tmp_fp32 = VecConvert::apply(src); + return VecConvert::apply(tmp_fp32); + } +}; + +template +struct VecConvert< + dst_t, + 1, + float, + 1, + typename std::enable_if_t, + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_float_to_int8(src[0]); + } +}; + +template +struct VecConvert< + float, + 1, + src_t, + 1, + typename 
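Aside: the reduced-float <-> int8 specializations above do not convert directly; they compose the two existing conversions through fp32. A scalar analogue of that routing (bf16_to_float and bf16_to_int8 are toy names; the cast below truncates, so it only illustrates the composition, not the exact rounding/clamping the vector path uses):

#include <cstdint>
#include <cstring>

float bf16_to_float(uint16_t b) {
  uint32_t bits = static_cast<uint32_t>(b) << 16;  // bf16 is the top half of fp32
  float f;
  std::memcpy(&f, &bits, sizeof f);
  return f;
}

int8_t bf16_to_int8(uint16_t b) {
  return static_cast<int8_t>(bf16_to_float(b));    // route through fp32
}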
std::enable_if_t, + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_int8_to_float(src[0]); + } +}; + +template +struct VecConvert< + dst_t, + 1, + int64_t, + 2, + typename std::enable_if< + std::is_same_v || + std::is_same_v>::type> { + static inline VectorizedN apply( + const VectorizedN& src) { + return VecConvert::apply( + VecConvert::apply(src)); + } +}; + +#endif + +template +struct VecConvert< + float, + 1, + src_t, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + auto [res_vec1, res_vec2] = convert_to_float(src[0]); + return res_vec1; + } +}; + +template +struct VecConvert< + dst_t, + 1, + float, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_from_float(src[0], src[0]); + } +}; + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h index 612f1ac6d21ba..bed6da627af2d 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h @@ -6,7 +6,8 @@ #include #include #include -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS #include #endif @@ -15,7 +16,7 @@ namespace at::vec { inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) template <> class Vectorized { private: @@ -100,6 +101,10 @@ template <> class Vectorized { Vectorized isnan() const { return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q); } + bool has_inf_nan() const { + __m256d self_sub = _mm256_sub_pd(values, values); + return (_mm256_movemask_epi8(_mm256_castpd_si256(self_sub)) & 0x77777777) != 0; + } Vectorized map(double (*const f)(double)) const { __at_align__ double tmp[size()]; store(tmp); @@ -136,6 +141,9 @@ template <> class Vectorized { Vectorized acos() const { return Vectorized(Sleef_acosd4_u10(values)); } + Vectorized acosh() const { + return Vectorized(Sleef_acoshd4_u10(values)); + } Vectorized asin() const { return Vectorized(Sleef_asind4_u10(values)); } diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h index 2ec41d7593da8..0e3664cd37b6a 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h @@ -6,7 +6,8 @@ #include #include #include -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS #include #endif @@ -14,7 +15,7 @@ namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) template <> class Vectorized { private: @@ -106,6 +107,12 @@ template <> class Vectorized { Vectorized isnan() const { return _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_UNORD_Q); } + + bool has_inf_nan() const { + __m256 self_sub = _mm256_sub_ps(values, values); + return (_mm256_movemask_epi8(_mm256_castps_si256(self_sub)) & 0x77777777) != 0; + } + Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; store(tmp); @@ -142,6 +149,9 @@ template <> class Vectorized { Vectorized acos() const { return Vectorized(Sleef_acosf8_u10(values)); } + Vectorized acosh() const { + return Vectorized(Sleef_acoshf8_u10(values)); + } Vectorized asin() const { return 
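Aside: the new has_inf_nan() members above lean on the identity that x - x is zero for every finite x and NaN for +/-inf and NaN, so one subtraction plus a NaN test covers both cases at once. A scalar reference of the idea (has_inf_nan_scalar is a hypothetical name; it assumes IEEE semantics, i.e. no -ffast-math):

bool has_inf_nan_scalar(float x) {
  float d = x - x;   // 0.0f for finite x, NaN for +/-inf and NaN
  return d != d;     // true only when d is NaN
}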
Vectorized(Sleef_asinf8_u10(values)); } @@ -217,14 +227,14 @@ template <> class Vectorized { static __m256 vec_factorial_5 = _mm256_set1_ps(0.00828929059f); // 1/factorial(5) static __m256 vec_exp_log2ef = - (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) + _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) static __m256 vec_half = _mm256_set1_ps(0.5f); static __m256 vec_one = _mm256_set1_ps(1.f); static __m256 vec_zero = _mm256_set1_ps(0.f); static __m256 vec_two = _mm256_set1_ps(2.f); - static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) - static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50); - static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); + static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) + static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); + static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); static int n_mantissa_bits = 23; @@ -257,7 +267,7 @@ template <> class Vectorized { auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); - auto vec_two_pow_n = (__m256)vec_two_pow_n_i; + auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); vec_two_pow_n = _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h index bf16d7236e50a..a5b993f2b9e10 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h @@ -307,6 +307,16 @@ template <> class Vectorized { } return loadu(res); }; + bool has_inf_nan() const { + __at_align__ float tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + if(_isnan(tmp[i]) || _isinf(tmp[i])) { + return true; + } + } + return false; + } Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; store(tmp); @@ -339,6 +349,12 @@ template <> class Vectorized { map(std::acos) ); } + Vectorized acosh() const { + return USE_SLEEF( + Vectorized(Sleef_acoshf4_u10(values.val[0]), Sleef_acoshf4_u10(values.val[1])), + map(std::acosh) + ); + } Vectorized asin() const { return USE_SLEEF( Vectorized(Sleef_asinf4_u10(values.val[0]), Sleef_asinf4_u10(values.val[1])), diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_half_neon.h b/aten/src/ATen/cpu/vec/vec256/vec256_half_neon.h new file mode 100644 index 0000000000000..aaf1d5995fc05 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec256/vec256_half_neon.h @@ -0,0 +1,819 @@ +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#include +#include +#include + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +// Right now contains only aarch64 implementation. +// Due to follow two reasons aarch32 is not currently supported. +// 1. Due to difference in ISA been aarch32 and aarch64, intrinsics +// that work for aarch64 dont work for aarch32. +// 2. Android NDK r21 has problems with compiling aarch32. +// Clang seg faults. +// https://github.com/android/ndk/issues/1248 +// https://bugs.llvm.org/show_bug.cgi?id=45824 +// Most likely we will do aarch32 support with inline asm. 
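Aside: the exp_u20 constants above switch from C-style (__m256) casts to _mm256_castsi256_ps, which expresses the same bit-pattern reinterpretation and is also accepted by MSVC. A one-liner sketch of the pattern (log2e_vec is a hypothetical name):

#include <immintrin.h>

// Build a float constant from its IEEE-754 bit pattern (0x3fb8aa3b ~ log2(e)).
__m256 log2e_vec() {
  return _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b));
}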
+#if !defined(C10_MOBILE) && defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +#ifdef __BIG_ENDIAN__ +#error "Big endian is not supported." +#endif + +template +struct BlendHalfRegs { + static float16x8_t impl( + const float16x8_t& a, + const float16x8_t& b, + float16x8_t& res); +}; + +template +struct BlendHalfRegs { + static float16x8_t impl( + const float16x8_t& a, + const float16x8_t& b, + float16x8_t& res) { + return vsetq_lane_f16(vgetq_lane_f16(b, index), res, index); + } +}; + +template +struct BlendHalfRegs { + static float16x8_t impl( + const float16x8_t& a, + const float16x8_t& b, + float16x8_t& res) { + return vsetq_lane_f16(vgetq_lane_f16(a, index), res, index); + } +}; + +// On ARM, Half type supports float16_t->Half constructor and Half->float16_t +// conversion +template <> +class Vectorized { + private: + float16x8x2_t values; + + public: + // value_type should be c10::Half to fit interface with vec_base.h + using value_type = c10::Half; + using size_type = int; + static constexpr size_type size() { + static_assert(sizeof(float16x8x2_t) == 16 * sizeof(value_type)); + return 16; + } + + private: + // We use these private map functions to implement various methods + Vectorized map2( + const Vectorized& second, + c10::Half (*const f)(c10::Half, c10::Half)) const { + __at_align__ c10::Half tmp_first[size()]; + __at_align__ c10::Half tmp_second[size()]; + store(tmp_first); // store this to tmp_first + second.store(tmp_second); + for (const auto i : c10::irange(size())) { + tmp_first[i] = f(tmp_first[i], tmp_second[i]); + } + return loadu(tmp_first); + } + + Vectorized map_with_vec_float_method( + Vectorized (Vectorized::*m)() const) const { + // Convert low float16x8_t to 2 float32x4_t variables, apply m, and convert + // back + float32x4_t v00 = vcvt_f32_f16(vget_low_f16(values.val[0])); + float32x4_t v01 = vcvt_f32_f16(vget_high_f16(values.val[0])); + Vectorized mv0 = (Vectorized(v00, v01).*m)(); + float16x4_t r00 = vcvt_f16_f32(mv0.get_low()); + float16x4_t r01 = vcvt_f16_f32(mv0.get_high()); + + // Convert high float16x8_t to 2 float32x4_t variables, apply m, and convert + // back + float32x4_t v10 = vcvt_f32_f16(vget_low_f16(values.val[1])); + float32x4_t v11 = vcvt_f32_f16(vget_high_f16(values.val[1])); + Vectorized mv1 = (Vectorized(v10, v11).*m)(); + float16x4_t r10 = vcvt_f16_f32(mv1.get_low()); + float16x4_t r11 = vcvt_f16_f32(mv1.get_high()); + + // Pack result into Vectorized + return Vectorized( + vcombine_f16(r00, r01), vcombine_f16(r10, r11)); + } + + Vectorized map2_with_vec_float_method( + const Vectorized& second, + Vectorized (Vectorized::*m)(const Vectorized&) + const) const { + // Convert low float16x8_t to 2 float32x4_t variables, apply m, and convert + // back + float32x4_t v00 = vcvt_f32_f16(vget_low_f16(values.val[0])); + float32x4_t v01 = vcvt_f32_f16(vget_high_f16(values.val[0])); + float32x4_t second_v00 = vcvt_f32_f16(vget_low_f16(second.get_low())); + float32x4_t second_v01 = vcvt_f32_f16(vget_high_f16(second.get_low())); + Vectorized mv0 = (Vectorized(v00, v01).*m)( + Vectorized(second_v00, second_v01)); + float16x4_t r00 = vcvt_f16_f32(mv0.get_low()); + float16x4_t r01 = vcvt_f16_f32(mv0.get_high()); + + // Convert high float16x8_t to 2 float32x4_t variables, apply m, and convert + // back + float32x4_t v10 = vcvt_f32_f16(vget_low_f16(values.val[1])); + float32x4_t v11 = vcvt_f32_f16(vget_high_f16(values.val[1])); + float32x4_t second_v10 = vcvt_f32_f16(vget_low_f16(second.get_high())); + float32x4_t second_v11 = 
vcvt_f32_f16(vget_high_f16(second.get_high())); + Vectorized mv1 = (Vectorized(v10, v11).*m)( + Vectorized(second_v10, second_v11)); + float16x4_t r10 = vcvt_f16_f32(mv1.get_low()); + float16x4_t r11 = vcvt_f16_f32(mv1.get_high()); + + // Pack result into Vectorized + return Vectorized( + vcombine_f16(r00, r01), vcombine_f16(r10, r11)); + } + + public: + // constructor + Vectorized() {} + Vectorized(float16x8x2_t v) : values(v) {} + + // A ctor that accepts c10::Half is needed to fit interface with vec_base.h + // A second constructor that takes float16_t is also included + Vectorized(c10::Half val) + : values{vdupq_n_f16((float16_t)val), vdupq_n_f16((float16_t)val)} { + } + Vectorized(float16_t val) : values{vdupq_n_f16(val), vdupq_n_f16(val)} {} + Vectorized( + float16_t val0, + float16_t val1, + float16_t val2, + float16_t val3, + float16_t val4, + float16_t val5, + float16_t val6, + float16_t val7, + float16_t val8, + float16_t val9, + float16_t val10, + float16_t val11, + float16_t val12, + float16_t val13, + float16_t val14, + float16_t val15) + : values{ + val0, + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15} {} + Vectorized(float16x8_t val0, float16x8_t val1) : values{val0, val1} {} + operator float16x8x2_t() const { + return values; + } + template + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + Vectorized vec; + // 0. + vec.values.val[0] = BlendHalfRegs<0, (mask & 0x01) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + vec.values.val[0] = BlendHalfRegs<1, (mask & 0x02) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + vec.values.val[0] = BlendHalfRegs<2, (mask & 0x04) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + vec.values.val[0] = BlendHalfRegs<3, (mask & 0x08) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + + vec.values.val[0] = BlendHalfRegs<4, (mask & 0x10) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + vec.values.val[0] = BlendHalfRegs<5, (mask & 0x20) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + vec.values.val[0] = BlendHalfRegs<6, (mask & 0x40) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + vec.values.val[0] = BlendHalfRegs<7, (mask & 0x80) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + + // 1. 
+ vec.values.val[1] = BlendHalfRegs<0, (mask & 0x10) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + vec.values.val[1] = BlendHalfRegs<1, (mask & 0x20) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + vec.values.val[1] = BlendHalfRegs<2, (mask & 0x40) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + vec.values.val[1] = BlendHalfRegs<3, (mask & 0x80) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + + vec.values.val[1] = BlendHalfRegs<4, (mask & 0x10) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + vec.values.val[1] = BlendHalfRegs<5, (mask & 0x20) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + vec.values.val[1] = BlendHalfRegs<6, (mask & 0x40) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + vec.values.val[1] = BlendHalfRegs<7, (mask & 0x80) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + + return vec; + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + // Note: using blendv is very awkward because 0xFFFF is one of many NaN's in + // FP16 It's unfortunate that the mask has type Half (required from + // vec_base) + + // TODO + // NB: This requires that each value, i.e., each uint value, + // of the mask either all be zeros or all be 1s. + // We perhaps need some kind of an assert? + // But that will affect performance. + Vectorized vec(mask.values); + vec.values.val[0] = vbslq_f16( + vreinterpretq_u16_f16(vec.values.val[0]), + b.values.val[0], + a.values.val[0]); + vec.values.val[1] = vbslq_f16( + vreinterpretq_u16_f16(vec.values.val[1]), + b.values.val[1], + a.values.val[1]); + return vec; + } + template + static Vectorized arange( + c10::Half base = 0.0, + step_t step = static_cast(1)) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const Vectorized step_sizes( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return fmadd(step_sizes, step_vec, base_vec); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + uint16_t pre_mask[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + for (int i = 0; i < count; i++) { + pre_mask[i] = 0xFFFF; + } + uint16x8x2_t mask = vld1q_u16_x2(pre_mask); + + // Using blendv is awkward because 0xFFFF is one of many NaN's in FP16 + // so we directly use vbslq_f16 instead + Vectorized vec( + vbslq_f16( + // Low bits + mask.val[0], + b.values.val[0], + a.values.val[0]), + // High bits + vbslq_f16(mask.val[1], b.values.val[1], a.values.val[1])); + + return vec; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) { + return vld1q_f16_x2(reinterpret_cast(ptr)); + } else if (count == (size() >> 1)) { + Vectorized res; + res.values.val[0] = vld1q_f16(reinterpret_cast(ptr)); + res.values.val[1] = vdupq_n_f16(0); + return res; + } else { + __at_align__ float16_t tmp_values[size()]; + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(float16_t)); + return vld1q_f16_x2(reinterpret_cast(tmp_values)); + } + } + void store(void* ptr, int64_t count = size()) const { + if (count == size()) { + vst1q_f16_x2(reinterpret_cast(ptr), values); + return; + } else if (count == (size() >> 1)) { + vst1q_f16(reinterpret_cast(ptr), values.val[0]); + } else { + float16_t tmp_values[size()]; + vst1q_f16_x2(reinterpret_cast(tmp_values), 
values); + std::memcpy(ptr, tmp_values, count * sizeof(float16_t)); + } + } + inline const float16x8_t& get_low() const { + return values.val[0]; + } + inline float16x8_t& get_low() { + return values.val[0]; + } + inline const float16x8_t& get_high() const { + return values.val[1]; + } + inline float16x8_t& get_high() { + return values.val[1]; + } + // Very slow implementation of indexing. + // Only required because vec256_qint refers to this. + // Once we specialize that implementation for ARM + // this should be removed. TODO (kimishpatel) + c10::Half operator[](int idx) const { + __at_align__ c10::Half tmp[size()]; + store(tmp); + return tmp[idx]; + } + c10::Half operator[](int idx) { + __at_align__ c10::Half tmp[size()]; + store(tmp); + return tmp[idx]; + } + // For boolean version where we want to if any 1/all zero + // etc. can be done faster in a different way. + int zero_mask() const { + __at_align__ c10::Half tmp[size()]; + store(tmp); + int mask = 0; + for (int i = 0; i < size(); ++i) { + if (tmp[i] == 0) { + mask |= (1 << i); + } + } + return mask; + } + Vectorized isnan() const { + __at_align__ c10::Half tmp[size()]; + __at_align__ c10::Half res[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + if (_isnan(tmp[i])) { + std::memset(static_cast(&res[i]), 0xFF, sizeof(c10::Half)); + } else { + std::memset(static_cast(&res[i]), 0, sizeof(c10::Half)); + } + } + return loadu(res); + }; + bool has_inf_nan() const { + __at_align__ c10::Half tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + if (_isnan(tmp[i]) || _isinf(tmp[i])) { + return true; + } + } + return false; + } + Vectorized map(c10::Half (*const f)(c10::Half)) const { + __at_align__ c10::Half tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vectorized abs() const { + return Vectorized( + vabsq_f16(values.val[0]), vabsq_f16(values.val[1])); + } + Vectorized angle() const { + auto zero = Vectorized(0); + auto pi = Vectorized(c10::pi); + auto tmp = blendv(zero, pi, *this < zero); + return blendv(tmp, *this, isnan()); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return Vectorized(0); + } + Vectorized conj() const { + return *this; + } + + // Sleef does not support FP16, so many math functions are applied by + // converting to FP32, applying the math function, and then converting back to + // FP16. 
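Aside: as the comment above says, these methods widen to FP32, call the float implementation, and narrow back. A standalone sketch of that round trip on one float16x8_t (map_via_fp32 is a hypothetical helper; assumes an aarch64 toolchain with NEON FP16 support, like the guard on this block):

#include <arm_neon.h>

// Apply a float32x4_t -> float32x4_t op to all eight half-precision lanes
// by widening, applying, and narrowing, like map_with_vec_float_method.
template <typename Op>
float16x8_t map_via_fp32(const Op& op, float16x8_t v) {
  float32x4_t lo = op(vcvt_f32_f16(vget_low_f16(v)));
  float32x4_t hi = op(vcvt_f32_f16(vget_high_f16(v)));
  return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));
}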
+ Vectorized acos() const { + return map_with_vec_float_method(&Vectorized::acos); + } + Vectorized acosh() const { + return map_with_vec_float_method(&Vectorized::acosh); + } + Vectorized asin() const { + return map_with_vec_float_method(&Vectorized::asin); + } + Vectorized atan() const { + return map_with_vec_float_method(&Vectorized::atan); + } + Vectorized atanh() const { + return map_with_vec_float_method(&Vectorized::atanh); + } + Vectorized atan2(const Vectorized& exp) const { + return map2_with_vec_float_method(exp, &Vectorized::atan2); + } + Vectorized copysign(const Vectorized& sign) const { + return map2_with_vec_float_method(sign, &Vectorized::copysign); + } + Vectorized erf() const { + return map_with_vec_float_method(&Vectorized::erf); + } + Vectorized erfc() const { + return map_with_vec_float_method(&Vectorized::erfc); + } + Vectorized erfinv() const { + return map_with_vec_float_method(&Vectorized::erfinv); + } + Vectorized exp() const { + return map_with_vec_float_method(&Vectorized::exp); + } + Vectorized exp2() const { + return map_with_vec_float_method(&Vectorized::exp2); + } + Vectorized expm1() const { + return map_with_vec_float_method(&Vectorized::expm1); + } + Vectorized exp_u20() const { + return map_with_vec_float_method(&Vectorized::exp_u20); + } + Vectorized fmod(const Vectorized& q) const { + // This function is questionable with a conversion, so we use map2 + return map2(q, std::fmod); + } + Vectorized hypot(const Vectorized& b) const { + return map2_with_vec_float_method(b, &Vectorized::hypot); + } + Vectorized i0() const { + return map_with_vec_float_method(&Vectorized::i0); + } + Vectorized i0e() const { + return map_with_vec_float_method(&Vectorized::i0e); + } + Vectorized digamma() const { + return map_with_vec_float_method(&Vectorized::digamma); + } + Vectorized igamma(const Vectorized& x) const { + return map2_with_vec_float_method(x, &Vectorized::igamma); + } + Vectorized igammac(const Vectorized& x) const { + return map2_with_vec_float_method(x, &Vectorized::igammac); + } + Vectorized log() const { + return map_with_vec_float_method(&Vectorized::log); + } + Vectorized log10() const { + return map_with_vec_float_method(&Vectorized::log10); + } + Vectorized log1p() const { + return map_with_vec_float_method(&Vectorized::log1p); + } + Vectorized log2() const { + return map_with_vec_float_method(&Vectorized::log2); + } + Vectorized nextafter(const Vectorized& b) const { + // This function does not make sense with conversion, so we use map2 + return map2(b, std::nextafter); + } + Vectorized frac() const; + Vectorized sin() const { + return map_with_vec_float_method(&Vectorized::sin); + } + Vectorized sinh() const { + return map_with_vec_float_method(&Vectorized::sinh); + } + Vectorized cos() const { + return map_with_vec_float_method(&Vectorized::cos); + } + Vectorized cosh() const { + return map_with_vec_float_method(&Vectorized::cosh); + } + Vectorized ceil() const { + // This function is questionable with a conversion, so we use map + return map(at::native::ceil_impl); + } + Vectorized floor() const { + // This function is questionable with a conversion, so we use map + return map(at::native::floor_impl); + } + Vectorized neg() const { + return Vectorized( + vnegq_f16(values.val[0]), vnegq_f16(values.val[1])); + } + inline Vectorized round() const { + // This function is questionable with a conversion, so we use map + return map(at::native::round_impl); + } + inline Vectorized tan() const { + return map_with_vec_float_method(&Vectorized::tan); + } + 
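For `fmod` and `nextafter` the methods above deliberately skip the FP32 detour and use `map2` with the scalar `std::fmod` / `std::nextafter` applied per lane. A rough stand-alone equivalent of that spill/apply/reload pattern is sketched below; `map2_lanewise` is a hypothetical name, and it operates on raw `float16_t` buffers whereas the real `map2` works through `c10::Half`.

```cpp
// Hedged sketch of the scalar per-lane fallback used by map2.
#include <arm_neon.h>

float16x8x2_t map2_lanewise(float16x8x2_t a, float16x8x2_t b,
                            float16_t (*f)(float16_t, float16_t)) {
  float16_t ta[16];
  float16_t tb[16];
  vst1q_f16_x2(ta, a);          // spill both operands to memory
  vst1q_f16_x2(tb, b);
  for (int i = 0; i < 16; ++i) {
    ta[i] = f(ta[i], tb[i]);    // apply the scalar binary op per lane
  }
  return vld1q_f16_x2(ta);      // reload the results
}
```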
inline Vectorized tanh() const { + return map_with_vec_float_method(&Vectorized::tanh); + } + Vectorized trunc() const { + float16x8_t r0 = vrndq_f16(values.val[0]); + float16x8_t r1 = vrndq_f16(values.val[1]); + return Vectorized(r0, r1); + } + Vectorized lgamma() const { + return map_with_vec_float_method(&Vectorized::lgamma); + } + Vectorized sqrt() const { + return Vectorized( + vsqrtq_f16(values.val[0]), vsqrtq_f16(values.val[1])); + } + Vectorized reciprocal() const { + auto ones = vdupq_n_f16(1.0f); + auto r0 = vdivq_f16(ones, values.val[0]); + auto r1 = vdivq_f16(ones, values.val[1]); + return Vectorized(r0, r1); + } + Vectorized rsqrt() const { + return this->sqrt().reciprocal(); + } + Vectorized pow(const Vectorized& exp) const { + return map2_with_vec_float_method(exp, &Vectorized::pow); + } + Vectorized operator==(const Vectorized& other) const { + float16x8_t r0 = + vreinterpretq_f16_u16(vceqq_f16(values.val[0], other.values.val[0])); + float16x8_t r1 = + vreinterpretq_f16_u16(vceqq_f16(values.val[1], other.values.val[1])); + return Vectorized(r0, r1); + } + + Vectorized operator!=(const Vectorized& other) const { + float16x8_t r0 = vreinterpretq_f16_u16( + vmvnq_u16(vceqq_f16(values.val[0], other.values.val[0]))); + float16x8_t r1 = vreinterpretq_f16_u16( + vmvnq_u16(vceqq_f16(values.val[1], other.values.val[1]))); + return Vectorized(r0, r1); + } + + Vectorized operator<(const Vectorized& other) const { + float16x8_t r0 = + vreinterpretq_f16_u16(vcltq_f16(values.val[0], other.values.val[0])); + float16x8_t r1 = + vreinterpretq_f16_u16(vcltq_f16(values.val[1], other.values.val[1])); + return Vectorized(r0, r1); + } + + Vectorized operator<=(const Vectorized& other) const { + float16x8_t r0 = + vreinterpretq_f16_u16(vcleq_f16(values.val[0], other.values.val[0])); + float16x8_t r1 = + vreinterpretq_f16_u16(vcleq_f16(values.val[1], other.values.val[1])); + return Vectorized(r0, r1); + } + + Vectorized operator>(const Vectorized& other) const { + float16x8_t r0 = + vreinterpretq_f16_u16(vcgtq_f16(values.val[0], other.values.val[0])); + float16x8_t r1 = + vreinterpretq_f16_u16(vcgtq_f16(values.val[1], other.values.val[1])); + return Vectorized(r0, r1); + } + + Vectorized operator>=(const Vectorized& other) const { + float16x8_t r0 = + vreinterpretq_f16_u16(vcgeq_f16(values.val[0], other.values.val[0])); + float16x8_t r1 = + vreinterpretq_f16_u16(vcgeq_f16(values.val[1], other.values.val[1])); + return Vectorized(r0, r1); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; // Vectorized + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vaddq_f16(a.get_low(), b.get_low()); + float16x8_t r1 = vaddq_f16(a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vsubq_f16(a.get_low(), b.get_low()); + float16x8_t r1 = vsubq_f16(a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vmulq_f16(a.get_low(), b.get_low()); + float16x8_t r1 = vmulq_f16(a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline operator/( + 
const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vdivq_f16(a.get_low(), b.get_low()); + float16x8_t r1 = vdivq_f16(a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +// frac. Implement this here so we can use subtraction +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vmaxq_f16(a.get_low(), b.get_low()); + float16x8_t r1 = vmaxq_f16(a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vminq_f16(a.get_low(), b.get_low()); + float16x8_t r1 = vminq_f16(a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vreinterpretq_f16_u16(vandq_u16( + vreinterpretq_u16_f16(a.get_low()), vreinterpretq_u16_f16(b.get_low()))); + float16x8_t r1 = vreinterpretq_f16_u16(vandq_u16( + vreinterpretq_u16_f16(a.get_high()), + vreinterpretq_u16_f16(b.get_high()))); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vreinterpretq_f16_u16(vorrq_u16( + vreinterpretq_u16_f16(a.get_low()), vreinterpretq_u16_f16(b.get_low()))); + float16x8_t r1 = vreinterpretq_f16_u16(vorrq_u16( + vreinterpretq_u16_f16(a.get_high()), + vreinterpretq_u16_f16(b.get_high()))); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vreinterpretq_f16_u16(veorq_u16( + vreinterpretq_u16_f16(a.get_low()), vreinterpretq_u16_f16(b.get_low()))); + float16x8_t r1 = vreinterpretq_f16_u16(veorq_u16( + vreinterpretq_u16_f16(a.get_high()), + vreinterpretq_u16_f16(b.get_high()))); + return Vectorized(r0, r1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +template <> +inline void convert(const float16_t* src, int16_t* dst, int64_t n) { + int64_t i; +#pragma unroll + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + vst1q_s16(dst + i, vcvtq_s16_f16(vld1q_f16(src + i))); + vst1q_s16(dst + i + 8, 
vcvtq_s16_f16(vld1q_f16(src + i + 8))); + } +#pragma unroll + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +inline void convert(const int16_t* src, float16_t* dst, int64_t n) { + int64_t i; +#pragma unroll + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + vst1q_f16(dst + i, vcvtq_f16_s16(vld1q_s16(src + i))); + vst1q_f16(dst + i + 8, vcvtq_f16_s16(vld1q_s16(src + i + 8))); + } +#pragma unroll + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + float16x8_t r0 = vfmaq_f16(c.get_low(), a.get_low(), b.get_low()); + float16x8_t r1 = vfmaq_f16(c.get_high(), a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + float16x8_t r0 = vfmsq_f16(c.get_low(), a.get_low(), b.get_low()); + float16x8_t r1 = vfmsq_f16(c.get_high(), a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +#endif /* defined(aarch64) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(C10_MOBILE) */ + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h index 392a22bee62ae..6263efd2039ce 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h @@ -494,7 +494,7 @@ class Vectorized : public Vectorizedi { template class Vectorized8 : public Vectorizedi { static_assert( - std::is_same::value || std::is_same::value, + std::is_same_v || std::is_same_v, "Only int8_t/uint8_t are supported"); protected: static const Vectorized ones; @@ -1382,7 +1382,7 @@ Vectorized inline shift_256_16(const Vectorized& a, const Vect return c; } -template ::value || std::is_same::value, int> = 0> +template || std::is_same_v, int> = 0> Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) { // No vector instruction for shifting int8_t/uint8_t, so emulating // it instead. 
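The comment in `shift_256_8` points at a general AVX2 gap: there are no 8-bit shift instructions. A much-simplified sketch of the emulation idea follows, for a uniform compile-time shift count only; the real `shift_256_8` takes a vector of per-lane counts and needs considerably more masking and interleaving than shown here. The function name `slli_epu8` is illustrative.

```cpp
#include <immintrin.h>

// Left-shift every uint8 lane by a compile-time count: do the shift in
// 16-bit lanes, then mask away the bits that leaked in from the byte below.
template <int kCount>
__m256i slli_epu8(__m256i a) {
  static_assert(kCount >= 0 && kCount < 8, "byte shift count must be 0..7");
  const __m256i shifted = _mm256_slli_epi16(a, kCount);
  const __m256i keep = _mm256_set1_epi8(static_cast<char>(0xFF << kCount));
  return _mm256_and_si256(shifted, keep);
}
```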
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_mask.h b/aten/src/ATen/cpu/vec/vec256/vec256_mask.h new file mode 100644 index 0000000000000..dd6a8c52d8265 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec256/vec256_mask.h @@ -0,0 +1,93 @@ +#pragma once + +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) + +template +struct VecMaskLoad< + T, + 1, + mask_t, + 1, + typename std::enable_if_t< + std::is_same_v || std::is_same_v || + std::is_same_v, + void>> { + static inline VectorizedN apply( + const T* ptr, + const VecMask& vec_mask) { + auto int_mask = vec_mask.template cast()[0]; + if constexpr (std::is_same_v) { + return Vectorized(_mm256_maskload_ps(ptr, int_mask)); + } else { + return Vectorized(_mm256_maskload_epi32(ptr, int_mask)); + } + } +}; + +// TODO: add specialization of VecMaskLoad for bfloat16/half and int8/uint8 + +template <> +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + return Vectorized(_mm256_castsi256_ps(vec_mask[0])); + } +}; + +template <> +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + return Vectorized(_mm256_castps_si256(vec_mask[0])); + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + auto int_vec = convert(VectorizedN(vec_mask)); + return VecMask(int_vec).cast(); + } +}; + +template <> +inline bool VecMask::all_zero() const { + return _mm256_testz_si256(mask_[0], mask_[0]); +} + +template <> +inline bool VecMask::is_masked(int i) const { + return _mm256_movemask_ps(_mm256_castsi256_ps(mask_[0])) & (1 << i); +} + +template <> +inline bool VecMask::all_masked() const { + int mask = _mm256_movemask_ps(_mm256_castsi256_ps(mask_[0])); + return mask == 0xff; +} + +#define VEC_MASK_METHOD_WITH_CAST_TO_INT( \ + T, N, return_type, method, args_def, args) \ + template <> \ + inline return_type VecMask::method args_def const { \ + return cast().method args; \ + } + +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_zero, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_zero, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, is_masked, (int i), (i)) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, is_masked, (int i), (i)) +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_masked, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_masked, (), ()) + +#undef VEC_MASK_DEFINE_METHOD_WITH_CAST_TO_INT + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index ee14de69324fa..c1defcdfd5189 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -41,11 +41,17 @@ namespace at::vec { inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) +#ifdef _MSC_VER +__declspec(align(64)) struct Vectorizedqi { + protected: + __m256i vals; +#else struct Vectorizedqi { protected: __m256i vals __attribute__((aligned(64))); +#endif public: Vectorizedqi() {} @@ -96,28 +102,36 @@ inline __m256i pack_saturate_and_clamp( _mm256_min_epu8(packed_and_sat, _mm256_set1_epi8(max_val))); } -inline Vectorized convert_uint8_to_float(at::vec::Vectorized src) { +template +typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> +inline convert_int8_to_float(at::vec::Vectorized src) { // Note: this function only convert inputs 
number of elements equal to at::vec::Vectorized.size() - // Only handle first 64 bits + // Only handle first 8*8 bits __m128i input_128 = _mm256_castsi256_si128(src); - // Convert from 8*uint8 to 8*int32 - __m256i input_256_int32 = _mm256_cvtepu8_epi32(input_128); + // Convert from 8*uint8/int8 to 8*int32 + __m256i input_256_int32; + if constexpr (std::is_same_v) + input_256_int32 = _mm256_cvtepu8_epi32(input_128); + else + input_256_int32 = _mm256_cvtepi8_epi32(input_128); // Convert from 8*int32 to 8*float return _mm256_cvtepi32_ps(input_256_int32); } -inline Vectorized convert_float_to_uint8(at::vec::Vectorized src) { +template +typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> +inline convert_float_to_int8(at::vec::Vectorized src) { // Convert from float32 to int32 with truncation __m256i x_values_int32 = _mm256_cvttps_epi32(src); // Convert from int32 to int16 using signed saturation __m256i xy_packed_v = _mm256_packs_epi32(x_values_int32, x_values_int32); - constexpr auto min_val = std::numeric_limits::min(); - constexpr auto max_val = std::numeric_limits::max(); + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); - // Convert from int16 to uint8 using unsigned saturation - __m256i xyzw_clamped_v = pack_saturate_and_clamp( + // Convert from int16 to uint8/int8 using unsigned saturation + __m256i xyzw_clamped_v = pack_saturate_and_clamp( xy_packed_v, xy_packed_v, min_val, max_val); __m256i permute_mask_v = _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); @@ -125,7 +139,7 @@ inline Vectorized convert_float_to_uint8(at::vec::Vectorized src } template -inline void __attribute__((always_inline)) QuantizeAvx2( +__FORCE_INLINE void QuantizeAvx2( const float* src, T* dst, int len, @@ -394,7 +408,7 @@ __m256i RequantizeAvx2( __m256 multiplier, __m256i zp) { static_assert( - std::is_same::value || std::is_same::value, + std::is_same_v || std::is_same_v, "Only int8_t/uint8_t are supported"); constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); @@ -1323,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V return a.maximum(b); } -#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#endif // if defined(CPU_CAPABILITY_AVX2) }} // namespace at::vec::CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h index e48dd542265fb..2d8afd9ef2952 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h @@ -51,6 +51,23 @@ inline void load_fp32_from_bf16( load_fp32_from_bf16(data, out2); } +inline void load_fp32_from_fp16(const c10::Half* data, Vectorized& out) { + __at_align__ float values[Vectorized::size()]; + for (const auto k : c10::irange(Vectorized::size())) { + values[k] = data[k]; + } + out = Vectorized::loadu(values); +} + +inline void load_fp32_from_fp16( + const c10::Half* data, + Vectorized& out1, + Vectorized& out2) { + load_fp32_from_fp16(data, out1); + data += Vectorized::size(); + load_fp32_from_fp16(data, out2); +} + } // namespace } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h index 05b2f6499261b..9f4d38c920f7b 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h +++ 
b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h @@ -212,12 +212,19 @@ class Vectorized { static Vectorized el_mergee( Vectorized& first, Vectorized& second) { - // as mergee phased in , we can use vec_perm with mask return { vec_mergeh(first._vec0, second._vec0), vec_mergeh(first._vec1, second._vec1)}; } + static Vectorized el_mergeo( + Vectorized& first, + Vectorized& second) { + return { + vec_mergel(first._vec0, second._vec0), + vec_mergel(first._vec1, second._vec1)}; + } + Vectorized abs_2_() const { auto a = (*this).elwise_mult(*this); auto permuted = a.el_swapped(); @@ -385,13 +392,11 @@ class Vectorized { static Vectorized horizontal_add( Vectorized& first, Vectorized& second) { - auto first_perm = first.el_swapped(); // 2perm - auto second_perm = second.el_swapped(); // 2perm - // summ - auto first_ret = first + first_perm; // 2add - auto second_ret = second + second_perm; // 2 add - // now lets choose evens - return el_mergee(first_ret, second_ret); // 2 mergee's + // Operates on individual floats, see _mm_hadd_ps + // {f0+f1, s0+s1, f2+f3, s2+s3, ...} + // i.e. it sums the re and im of each value and interleaves first and second: + // {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} + return el_mergee(first, second) + el_mergeo(first, second); } static Vectorized horizontal_sub( @@ -432,25 +437,20 @@ class Vectorized { // re + im*i = (a + bi) / (c + di) // re = (ac + bd)/abs_2() // im = (bc - ad)/abs_2() -#if 1 - auto vi = b.el_mergeo(); - auto vr = b.el_mergee(); - auto abs_b = b.abs_2_(); - vi = vi ^ vd_isign_mask; - auto ret = elwise_mult(vr); - auto vx_swapped = el_swapped(); - ret = vx_swapped.el_madd(vi, ret); - ret = ret.elwise_div(abs_b); -#else - // Vectorized x86 simulation - auto ac_bd = elwise_mult(b); - auto d_c = b.el_swapped(); - d_c = d_c ^ vd_rsign_mask; - auto ad_bc = elwise_mult(d_c); - auto abs_b = b.abs_2_(); - auto re_im = horizontal_add(ac_bd, ad_bc); - auto ret = re_im.elwise_div(abs_b); -#endif + auto fabs_cd = Vectorized{ + vec_andc(b._vec0, vd_sign_mask), + vec_andc(b._vec1, vd_sign_mask)}; // |c| |d| + auto fabs_dc = fabs_cd.el_swapped(); // |d| |c| + auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|) + auto a2 = elwise_div(scale); // a/sc b/sc + auto b2 = b.elwise_div(scale); // c/sc d/sc + auto acbd2 = a2.elwise_mult(b2); // ac/sc^2 bd/sc^2 + auto dc2 = b2.el_swapped(); // d/sc c/sc + dc2 = dc2 ^ vd_rsign_mask; // -d/sc c/sc + auto adbc2 = a2.elwise_mult(dc2); // -ad/sc^2 bc/sc^2 + auto ret = horizontal_add(acbd2, adbc2); // (ac+bd)/sc^2 (bc-ad)/sc^2 + auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 + ret = ret.elwise_div(denom2); return ret; } @@ -511,13 +511,14 @@ class Vectorized { DEFINE_MEMBER_OP(operator&, ComplexDbl, vec_and) DEFINE_MEMBER_OP(operator|, ComplexDbl, vec_or) DEFINE_MEMBER_OP(operator^, ComplexDbl, vec_xor) - // elelemtwise helpers + // elementwise helpers DEFINE_MEMBER_OP(elwise_mult, ComplexDbl, vec_mul) DEFINE_MEMBER_OP(elwise_div, ComplexDbl, vec_div) DEFINE_MEMBER_OP(elwise_gt, ComplexDbl, vec_cmpgt) DEFINE_MEMBER_OP(elwise_ge, ComplexDbl, vec_cmpge) DEFINE_MEMBER_OP(elwise_lt, ComplexDbl, vec_cmplt) DEFINE_MEMBER_OP(elwise_le, ComplexDbl, vec_cmple) + DEFINE_MEMBER_OP(elwise_max, ComplexDbl, vec_max) }; template <> diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h index 91bf616db4bc4..53e80523f761a 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h +++ 
b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h @@ -238,18 +238,14 @@ class Vectorized { return loadu(tmp); } - static Vectorized horizontal_add_permD8( + static Vectorized horizontal_add( Vectorized& first, Vectorized& second) { - // we will simulate it differently with 6 instructions total - // lets permute second so that we can add it getting horizontal sums - auto first_perm = first.el_swapped(); // 2perm - auto second_perm = second.el_swapped(); // 2perm - // sum - auto first_ret = first + first_perm; // 2add - auto second_ret = second + second_perm; // 2 add - // now lets choose evens - return el_mergee(first_ret, second_ret); // 2 mergee's + // Operates on individual floats, see _mm_hadd_ps + // {f0+f1, s0+s1, f2+f3, s2+s3, ...} + // i.e. it sums the re and im of each value and interleaves first and second: + // {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} + return el_mergee(first, second) + el_mergeo(first, second); } static Vectorized horizontal_sub_permD8( @@ -353,12 +349,19 @@ class Vectorized { static Vectorized el_mergee( Vectorized& first, Vectorized& second) { - // as mergee phased in , we can use vec_perm with mask return { vec_mergee(first._vecb0, second._vecb0), vec_mergee(first._vecb1, second._vecb1)}; } + static Vectorized el_mergeo( + Vectorized& first, + Vectorized& second) { + return { + vec_mergeo(first._vecb0, second._vecb0), + vec_mergeo(first._vecb1, second._vecb1)}; + } + Vectorized angle_() const { // angle = atan2(b/a) // auto b_a = _mm256_permute_ps(values, 0xB1); // b a @@ -488,25 +491,20 @@ class Vectorized { // re + im*i = (a + bi) / (c + di) // re = (ac + bd)/abs_2() // im = (bc - ad)/abs_2() -#if 1 - auto vi = b.el_mergeo(); - auto vr = b.el_mergee(); - auto abs_b = b.abs_2_(); - vi = vi ^ isign_mask; - auto ret = elwise_mult(vr); - auto vx_swapped = el_swapped(); - ret = vx_swapped.el_madd(vi, ret); - ret = ret.elwise_div(abs_b); -#else - // Vectorized x86 simulation - auto ac_bd = elwise_mult(b); - auto d_c = b.el_swapped(); - d_c = d_c ^ rsign_mask; - auto ad_bc = elwise_mult(d_c); - auto abs_b = b.abs_2_(); - auto re_im = horizontal_add_permD8(ac_bd, ad_bc); - auto ret = re_im.elwise_div(abs_b); -#endif + auto fabs_cd = Vectorized{ + vec_andc(b._vec0, sign_mask), + vec_andc(b._vec1, sign_mask)}; // |c| |d| + auto fabs_dc = fabs_cd.el_swapped(); // |d| |c| + auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|) + auto a2 = elwise_div(scale); // a/sc b/sc + auto b2 = b.elwise_div(scale); // c/sc d/sc + auto acbd2 = a2.elwise_mult(b2); // ac/sc^2 bd/sc^2 + auto dc2 = b2.el_swapped(); // d/sc c/sc + dc2 = dc2 ^ rsign_mask; // -d/sc c/sc + auto adbc2 = a2.elwise_mult(dc2); // -ad/sc^2 bc/sc^2 + auto ret = horizontal_add(acbd2, adbc2); // (ac+bd)/sc^2 (bc-ad)/sc^2 + auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 + ret = ret.elwise_div(denom2); return ret; } @@ -589,6 +587,7 @@ class Vectorized { DEFINE_MEMBER_OP(elwise_ge, ComplexFlt, vec_cmpge) DEFINE_MEMBER_OP(elwise_lt, ComplexFlt, vec_cmplt) DEFINE_MEMBER_OP(elwise_le, ComplexFlt, vec_cmple) + DEFINE_MEMBER_OP(elwise_max, ComplexFlt, vec_max) }; template <> diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h index bfcf5d984987e..139044cbd4698 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h @@ -383,6 +383,19 @@ class Vectorized { auto ret = (x == x); return ret._nor(); } + bool has_inf_nan() const { + 
for (const auto i : c10::irange(size()/2)) { + if(_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size()/2)) { + if(_isnan(_vec1[i]) || _isinf(_vec1[i])) { + return true; + } + } + return false; + } DEFINE_MEMBER_OP(operator==, double, vec_cmpeq) DEFINE_MEMBER_OP(operator!=, double, vec_cmpne) diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h index d08fb54fd56ec..0003773e37c89 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h @@ -234,9 +234,18 @@ class Vectorized { return ret._nor(); } - Vectorized _isinf() const { - auto x = *this; - return (x == v_inf) | (x == v_minus_inf); + bool has_inf_nan() const { + for (const auto i : c10::irange(size()/2)) { + if(_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size()/2)) { + if(_isnan(_vec1[i]) || _isinf(_vec1[i])) { + return true; + } + } + return false; } int zero_mask() const { diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h index 26c90a371f903..e8d12eb938e54 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h @@ -91,7 +91,7 @@ struct Vectorized { vec_vsx_ld(offset0, reinterpret_cast(ptr)), vec_vsx_ld(offset16, reinterpret_cast(ptr))}; } - __at_align__ value_type tmp_values[size()]; + __at_align__ value_type tmp_values[size()] = {}; std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; } diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h index c3a320af156de..93f80a14638e9 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h @@ -94,7 +94,7 @@ struct Vectorized { vec_vsx_ld(offset0, reinterpret_cast(ptr)), vec_vsx_ld(offset16, reinterpret_cast(ptr))}; } - __at_align__ value_type tmp_values[size()]; + __at_align__ value_type tmp_values[size()] = {}; std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; } diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h index d5033990f60c7..1dc742f3cbb1c 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h @@ -391,6 +391,7 @@ const vbool32 imag_mask = vbool32{0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF}; const vbool32 isign_mask = vbool32{0x0, 0x80000000, 0x0, 0x80000000}; const vbool32 rsign_mask = vbool32{0x80000000, 0x0, 0x80000000, 0x0}; +const vbool64 vd_sign_mask = vbool64{0x8000000000000000, 0x8000000000000000}; const vbool64 vd_imag_mask = vbool64{0x0, 0xFFFFFFFFFFFFFFFF}; const vbool64 vd_real_mask = vbool64{0xFFFFFFFFFFFFFFFF, 0x0}; const vbool64 vd_isign_mask = vbool64{0x0, 0x8000000000000000}; diff --git a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h index e28c999983c0c..b70b494649b36 100644 --- a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h +++ b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h @@ -13,8 +13,6 @@ #include #include -#define SLEEF_MEMORY_WORKAROUND - namespace at { namespace vec { @@ -393,40 +391,84 @@ struct 
Vectorized()>> { C10_ALWAYS_INLINE Vectorized(T s) : _vec0{vec_splats((ElementType)s)}, _vec1{vec_splats((ElementType)s)} {} - static Vectorized C10_ALWAYS_INLINE - loadu(const void* ptr, int count = size()) { - if (count == size()) { + template + struct LoaduHelper { + static Vectorized C10_ALWAYS_INLINE + loadu(const U* ptr, int count = size()) { + __at_align__ ElementType tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); + return { - vec_xl(offset0, reinterpret_cast(ptr)), - vec_xl(offset16, reinterpret_cast(ptr))}; + vec_xl(offset0, &(tmp_values[0])), + vec_xl(offset16, &(tmp_values[0]))}; } + }; + + template + struct LoaduHelper { + static Vectorized C10_ALWAYS_INLINE + loadu(const ElementType* ptr, int count = size()) { + if (count == size()) { + return { + vec_xl(offset0, ptr), + vec_xl(offset16, ptr)}; + } - __at_align__ ElementType tmp_values[size()] = {}; - std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); + __at_align__ ElementType tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); - return { - vec_xl(offset0, reinterpret_cast(tmp_values)), - vec_xl(offset16, reinterpret_cast(tmp_values))}; + return { + vec_xl(offset0, &(tmp_values[0])), + vec_xl(offset16, &(tmp_values[0]))}; + } + }; + + template + static Vectorized C10_ALWAYS_INLINE + loadu(const U* ptr, int count = size()) { + return LoaduHelper::loadu(ptr, count); } - static Vectorized C10_ALWAYS_INLINE - loadu_one_fourth(const void* ptr) { + template + static Vectorized C10_ALWAYS_INLINE + loadu_one_fourth(const U* ptr) { // load only first 8 bytes // only intended to be used with uint8_t return loadu(ptr, 8 / sizeof(ElementType)); } - void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { - if (count == size()) { - vec_xst(_vec0, offset0, reinterpret_cast(ptr)); - vec_xst(_vec1, offset16, reinterpret_cast(ptr)); - } else if (count > 0) { - __at_align__ ElementType tmp_values[size()]; - vec_xst(_vec0, offset0, reinterpret_cast(tmp_values)); - vec_xst(_vec1, offset16, reinterpret_cast(tmp_values)); - std::memcpy( - ptr, tmp_values, std::min(count, size()) * sizeof(ElementType)); + template + struct StoreHelper { + static void C10_ALWAYS_INLINE store(const Vectorized &vec, U* ptr, int count = size()) { + if (count > 0) { + __at_align__ ElementType tmp_values[size()]; + vec_xst(vec._vec0, offset0, &(tmp_values[0])); + vec_xst(vec._vec1, offset16, &(tmp_values[0])); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(ElementType)); + } + } + }; + + template + struct StoreHelper { + static void C10_ALWAYS_INLINE store(const Vectorized &vec, ElementType* ptr, int count = size()) { + if (count == size()) { + vec_xst(vec._vec0, offset0, ptr); + vec_xst(vec._vec1, offset16, ptr); + } else if (count > 0) { + __at_align__ ElementType tmp_values[size()]; + vec_xst(vec._vec0, offset0, &(tmp_values[0])); + vec_xst(vec._vec1, offset16, &(tmp_values[0])); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(ElementType)); + } } + }; + + template + void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const { + return StoreHelper::store(*this, ptr, count); } C10_ALWAYS_INLINE const vtype& vec0() const { @@ -875,6 +917,20 @@ struct Vectorized()>> { return ret._not(); } + bool has_inf_nan() const { + for (const auto i : c10::irange(size()/2)) { + if(_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size()/2)) { + 
if(_isnan(_vec1[i]) || _isinf(_vec1[i])) { + return true; + } + } + return false; + } + template < typename U = T, std::enable_if_t::value, int> = 0> @@ -1090,32 +1146,20 @@ struct Vectorized()>> { } Vectorized sin() const { -#ifndef SLEEF_MEMORY_WORKAROUND return mapSleef(Sleef_sinf4_u10, Sleef_sind2_u10); -#else - return mapOrdinary(std::sin); -#endif } Vectorized sinh() const { return mapSleef(Sleef_sinhf4_u10, Sleef_sinhd2_u10); } Vectorized cos() const { -#ifndef SLEEF_MEMORY_WORKAROUND return mapSleef(Sleef_cosf4_u10, Sleef_cosd2_u10); -#else - return mapOrdinary(std::cos); -#endif } Vectorized cosh() const { return mapSleef(Sleef_coshf4_u10, Sleef_coshd2_u10); } Vectorized tan() const { -#ifndef SLEEF_MEMORY_WORKAROUND return mapSleef(Sleef_tanf4_u10, Sleef_tand2_u10); -#else - return mapOrdinary(std::tan); -#endif } Vectorized tanh() const { return mapSleef(Sleef_tanhf4_u10, Sleef_tanhd2_u10); @@ -1447,19 +1491,19 @@ inline ZSimdVect vec_flt_int(const ZSimdVect x) { #define vec_flt_int vec_signed #endif -Vectorized convert_to_float(const Vectorized& x) { +Vectorized zvec_convert_to_float(const Vectorized& x) { return {vec_int_flt(x.vec0()), vec_int_flt(x.vec1())}; } -Vectorized convert_to_int(const Vectorized& x) { +Vectorized zvec_convert_to_int(const Vectorized& x) { return {vec_flt_int(x.vec0()), vec_flt_int(x.vec1())}; } -Vectorized convert_to_float(const Vectorized& x) { +Vectorized zvec_convert_to_float(const Vectorized& x) { return {vec_double(x.vec0()), vec_double(x.vec1())}; } -Vectorized convert_to_int(const Vectorized& x) { +Vectorized zvec_convert_to_int(const Vectorized& x) { return {vec_signed(x.vec0()), vec_signed(x.vec1())}; } @@ -1517,13 +1561,13 @@ Vectorized C10_ALWAYS_INLINE fmadd( template <> Vectorized C10_ALWAYS_INLINE convert_to_int_of_same_size(const Vectorized& src) { - return convert_to_int(src); + return zvec_convert_to_int(src); } template <> Vectorized C10_ALWAYS_INLINE convert_to_int_of_same_size(const Vectorized& src) { - return convert_to_int(src); + return zvec_convert_to_int(src); } template <> @@ -1535,7 +1579,7 @@ inline void convert(const int32_t* src, float* dst, int64_t n) { const int32_t* src_a = src + i; float* dst_a = dst + i; auto input_vec = Vectorized::loadu(src_a); - auto output_vec = convert_to_float(input_vec); + auto output_vec = zvec_convert_to_float(input_vec); output_vec.store(dst_a); } @@ -1552,7 +1596,7 @@ inline void convert(const int64_t* src, double* dst, int64_t n) { const int64_t* src_a = src + i; double* dst_a = dst + i; auto input_vec = Vectorized::loadu(src_a); - auto output_vec = convert_to_float(input_vec); + auto output_vec = zvec_convert_to_float(input_vec); output_vec.store(dst_a); } for (; i < n; i++) { @@ -1696,12 +1740,14 @@ struct Vectorized()>> { return _vec; } + template static Vectorized C10_ALWAYS_INLINE - loadu(const void* ptr, int count = size()) { + loadu(const U* ptr, int count = size()) { return Vectorized{vinner_type::loadu(ptr, count)}; } - void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + template + void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const { _vec.store(ptr, count); } @@ -1729,7 +1775,7 @@ struct Vectorized()>> { Vectorized scale, Vectorized zero_point, Vectorized scale_zp_premul) const { - auto float_val = convert_to_float(_vec); + auto float_val = zvec_convert_to_float(_vec); return {fmadd(scale, float_val, scale_zp_premul)}; } @@ -1739,7 +1785,7 @@ struct Vectorized()>> { float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point) const { - 
auto float_val = convert_to_float(_vec); + auto float_val = zvec_convert_to_float(_vec); return {(float_val - zero_point) * scale}; } @@ -1754,7 +1800,7 @@ struct Vectorized()>> { Vectorized vecf = rhs[0]; vecf = vecf * Vectorized(inverse_scale); vecf = vecf.rint() + Vectorized((float)(zero_point)); - auto veci = convert_to_int(vecf); + auto veci = zvec_convert_to_int(vecf); return Vectorized{veci}; } @@ -1767,10 +1813,10 @@ struct Vectorized()>> { float multiplier, int32_t zero_point) { Vectorized vi = inp[0]; - auto vecf = convert_to_float(vi.vec()); + auto vecf = zvec_convert_to_float(vi.vec()); vecf = vecf * Vectorized(multiplier); vecf = vecf.rint(); - auto veci = convert_to_int(vecf) + Vectorized(zero_point); + auto veci = zvec_convert_to_int(vecf) + Vectorized(zero_point); return Vectorized{veci}; } @@ -1805,11 +1851,11 @@ struct Vectorized()>> { auto ret32_0 = unpack(ret16.first); auto ret32_1 = unpack(ret16.second); - auto vecf_0 = convert_to_float(ret32_0.first); - auto vecf_1 = convert_to_float(ret32_0.second); + auto vecf_0 = zvec_convert_to_float(ret32_0.first); + auto vecf_1 = zvec_convert_to_float(ret32_0.second); - auto vecf_2 = convert_to_float(ret32_1.first); - auto vecf_3 = convert_to_float(ret32_1.second); + auto vecf_2 = zvec_convert_to_float(ret32_1.first); + auto vecf_3 = zvec_convert_to_float(ret32_1.second); return { fmadd(scale, vecf_0, scale_zp_premul), fmadd(scale, vecf_1, scale_zp_premul), @@ -1828,11 +1874,11 @@ struct Vectorized()>> { auto ret32_0 = unpack(ret16.first); auto ret32_1 = unpack(ret16.second); - auto vecf_0 = convert_to_float(ret32_0.first); - auto vecf_1 = convert_to_float(ret32_0.second); + auto vecf_0 = zvec_convert_to_float(ret32_0.first); + auto vecf_1 = zvec_convert_to_float(ret32_0.second); - auto vecf_2 = convert_to_float(ret32_1.first); - auto vecf_3 = convert_to_float(ret32_1.second); + auto vecf_2 = zvec_convert_to_float(ret32_1.first); + auto vecf_3 = zvec_convert_to_float(ret32_1.second); return { (vecf_0 - zero_point) * scale, @@ -1867,10 +1913,10 @@ struct Vectorized()>> { vecf4 = vecf4.rint() + vec_zero_point; vecf6 = vecf6.rint() + vec_zero_point; - auto veci0 = convert_to_int(vecf0); - auto veci2 = convert_to_int(vecf2); - auto veci4 = convert_to_int(vecf4); - auto veci6 = convert_to_int(vecf6); + auto veci0 = zvec_convert_to_int(vecf0); + auto veci2 = zvec_convert_to_int(vecf2); + auto veci4 = zvec_convert_to_int(vecf4); + auto veci6 = zvec_convert_to_int(vecf6); auto vecshi0 = pack(veci0, veci2); auto vecshi2 = pack(veci4, veci6); @@ -1894,11 +1940,11 @@ struct Vectorized()>> { Vectorized vi2 = inp[2]; Vectorized vi3 = inp[3]; - auto vecf0 = convert_to_float(vi0.vec()); - auto vecf2 = convert_to_float(vi1.vec()); + auto vecf0 = zvec_convert_to_float(vi0.vec()); + auto vecf2 = zvec_convert_to_float(vi1.vec()); - auto vecf4 = convert_to_float(vi2.vec()); - auto vecf6 = convert_to_float(vi3.vec()); + auto vecf4 = zvec_convert_to_float(vi2.vec()); + auto vecf6 = zvec_convert_to_float(vi3.vec()); vecf0 = vecf0 * vec_multiplier; vecf2 = vecf2 * vec_multiplier; @@ -1911,10 +1957,10 @@ struct Vectorized()>> { vecf4 = vecf4.rint(); vecf6 = vecf6.rint(); - auto veci0 = convert_to_int(vecf0); - auto veci2 = convert_to_int(vecf2); - auto veci4 = convert_to_int(vecf4); - auto veci6 = convert_to_int(vecf6); + auto veci0 = zvec_convert_to_int(vecf0); + auto veci2 = zvec_convert_to_int(vecf2); + auto veci4 = zvec_convert_to_int(vecf4); + auto veci6 = zvec_convert_to_int(vecf6); veci0 = veci0 + vec_zero_point; veci2 = veci2 + vec_zero_point; 
@@ -2160,12 +2206,14 @@ struct Vectorized()>> { return _vec.data(); } + template static Vectorized C10_ALWAYS_INLINE - loadu(const void* ptr, int count = size()) { + loadu(const U* ptr, int count = size()) { return Vectorized{vinner_type::loadu(ptr, 2 * count)}; } - void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + template + void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const { return _vec.store(ptr, 2 * count); } @@ -2776,19 +2824,23 @@ std::pair, Vectorized> inline deinterleave2< return inner_deinterleave2(a, b); } -inline Vectorized convert_uint8_to_float(const Vectorized &src) { +template +typename std::enable_if::value, at::vec::Vectorized>::type +inline convert_int8_to_float(const Vectorized &src) { // Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size() // Only handle first 64 bits auto vec_int = src.to_vec_float_helper(); - return convert_to_float(vec_int); + return zvec_convert_to_float(vec_int); } -inline Vectorized convert_float_to_uint8(const Vectorized &src) { - constexpr auto min_val = std::numeric_limits::min(); - constexpr auto max_val = std::numeric_limits::max(); +template +typename std::enable_if::value, at::vec::Vectorized>::type +inline convert_float_to_int8(const Vectorized &src) { + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); - auto vec_int = clamp(convert_to_int(src), Vectorized(min_val), Vectorized(max_val)); + auto vec_int = clamp(zvec_convert_to_int(src), Vectorized(min_val), Vectorized(max_val)); return vec_int.to_vec_uint8_helper(); } diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h index fd2e058b63ac6..c7fa23b23a607 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512.h @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include @@ -55,7 +57,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { } -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -80,7 +82,8 @@ inline Vectorized cast(const Vectorized& src) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - +#ifndef _MSC_VER +// MSVC is not working well on complex function overload. template std::enable_if_t> inline gather(const double* base_addr, const Vectorized& vindex) { @@ -92,9 +95,10 @@ std::enable_if_t& vindex) { return _mm512_i32gather_ps(vindex, base_addr, scale); } - +#endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - +#ifndef _MSC_VER +// MSVC is not working well on complex function overload. 
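The `_MSC_VER` guard added here excludes the gather and mask-gather overloads when building with MSVC, per the overload-resolution issue noted in the comment. As a hedged reference for what the float path of the guarded `mask_gather` overloads (shown next) boils down to, the core is a single AVX-512 intrinsic; `mask_gather16` is an illustrative name, not part of the patch.

```cpp
#include <immintrin.h>

// Lanes with a 0 bit in k keep the corresponding value from src; lanes with
// a 1 bit load base[idx[i]]. Scale 4 == sizeof(float).
__m512 mask_gather16(__m512 src, __mmask16 k, const float* base, __m512i idx) {
  return _mm512_mask_i32gather_ps(src, k, idx, base, /*scale=*/4);
}
```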
template std::enable_if_t> inline mask_gather(const Vectorized& src, const double* base_addr, @@ -112,7 +116,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); } - +#endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template<> @@ -127,6 +131,18 @@ inline convert_to_int_of_same_size(const Vectorized &src) { return _mm512_cvttps_epi32(src); } +template<> +Vectorized +inline convert_to_fp_of_same_size(const Vectorized &src) { + return _mm512_cvtepi64_pd(src); +} + +template<> +Vectorized +inline convert_to_fp_of_same_size(const Vectorized &src) { + return _mm512_cvtepi32_ps(src); +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> @@ -258,6 +274,6 @@ inline Vectorized flip(const Vectorized & v) { return flip8(v); } -#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#endif // defined(CPU_CAPABILITY_AVX512) }}} diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h index acc074435a489..c7132349418de 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h @@ -7,7 +7,8 @@ #include #include -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) +#define SLEEF_STATIC_LIBS #include #endif @@ -16,7 +17,18 @@ namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) + +#ifndef SLEEF_CONST +#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +#define SLEEF_CONST const +#else +#define SLEEF_CONST +#endif +#define SLEEF_CONST_OLD SLEEF_CONST +#else +#define SLEEF_CONST_OLD +#endif // bfloat16 conversion static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { @@ -30,6 +42,25 @@ static inline void cvtbf16_fp32(const __m512i& a, __m512& o1, __m512& o2) { cvtbf16_fp32(hi, o2); } +static inline __m256i cvtfp32_bf16(const __m512& src) { + __m512i value = _mm512_castps_si512(src); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; + t_value = _mm512_add_epi32(t_value, vec_bias); + // input += rounding_bias; + t_value = _mm512_add_epi32(t_value, value); + // input = input >> 16; + t_value = _mm512_srli_epi32(t_value, 16); + // Check NaN before converting back to bf16 + t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); + return _mm512_cvtusepi32_epi16(t_value); +} + static inline __m512i cvtfp32_bf16(const __m512& a, const __m512& b) { __m512i lo = _mm512_castps_si512(a); __m512i hi = _mm512_castps_si512(b); @@ -81,6 +112,11 @@ static inline void cvtfp16_fp32(const __m512i& a, __m512& o1, __m512& o2) { cvtfp16_fp32(hi, o2); } +static inline __m256i cvtfp32_fp16(const __m512& src) { + return _mm512_cvtps_ph( + src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +} + static inline __m512i cvtfp32_fp16(const __m512& a, const __m512& b) { __m256i lo = _mm512_cvtps_ph( a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); @@ -343,7 +379,8 @@ static_assert( } #pragma 
clang diagnostic push #pragma clang diagnostic ignored "-Wignored-qualifiers" - Vectorized map(const __m512 (*const vop)(__m512)) const { + + Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { __m512 lo, hi; cvt_to_fp32(values, lo, hi); const auto o1 = vop(lo); @@ -400,6 +437,9 @@ static_assert( Vectorized acos() const { return map(Sleef_acosf16_u10); } + Vectorized acosh() const { + return map(Sleef_acoshf16_u10); + } Vectorized asin() const { return map(Sleef_asinf16_u10); } @@ -936,6 +976,395 @@ Vectorized inline fmadd(const Vectorized& a, return cvtfp32_bf16(o1, o2); } +static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) { + __m512i r[8]; + // a0a1 a2a3 a4a5 a6a7 a8a9 a10a11 a12a13 a14a15 e0e1 e2e3 e4e5 e6e7 e8e9 e10e11 e12e13 e14e15 + // b0-b15 f0-f15 + // c0-c15 g0-g15 + // d0-d15 h0-h15 + // i0-i15 m0-m15 + // j0-j15 n0-n15 + // k0-k15 o0-o15 + // l0-l15 p0-p15 +#pragma unroll(4) + for (int i = 0; i < 4; i++) { + r[i] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i]), t[i + 4], 0x01); + r[i + 4] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i + 8]), t[i + 12], 0x01); + } + + // u0: a0a1 b0b1 a2a3 b2b3 a8a9 b8b9 a10a11 b10b11 e0e1 f0f1 e2e3 f2f3 e8e9 f8f9 e10e11 f10f11 + // u1: a4a5 b4b5 a6a7 b6b7 a12a13 b12b13 a14a15 b14b15 e4e5 f4f5 e6e7 f6f7 e12e13 f12f13 e14e15 f14f15 + // u2: c0c1 d0d1 c2c3 d2d3 c8c9 d8d9 c10c11 d10d11 g0g1 h0h1 g2g3 h2h3 g8g9 h8h9 g10g11 h10h11 + // u3: c4c5 d4b5 c6c7 d6b7 c12c13 d12d13 c14c15 d14d15 g4g5 h4h5 g6g7 h6h7 g12g13 h12h13 g14g15 h14h15 + // i j m n + // k l o p +#pragma unroll(4) + for (int i = 0; i < 8; i += 2) { + u[i] = _mm512_unpacklo_epi32(r[i], r[i + 1]); + u[i + 1] = _mm512_unpackhi_epi32(r[i], r[i + 1]); + } + + // r0: a0a1 b0b1 c0c1 d0d1 a8a9 b8b9 c8c9 d8d9 e0e1 f0f1 g0g1 h0h1 e8e9 f8f9 g8g9 h8h9 + // r1: a2a3 b2b3 c2c3 d2d3 a10a11 b10b11 c10c11 d10d11 e2e3 f2f3 g2g3 h2h3 e10e11 f10f11 g10g11 h10h11 + // r2: a4a5 b4b5 c4c5 d4b5 a12a13 b12b13 c12c13 d12d13 + // r3: a6a7 b6b7 c6c7 d6b7 a14a15 b14b15 c14c15 d14d15 + // r4: i j k l m n o p + r[0] = _mm512_unpacklo_epi64(u[0], u[2]); + r[1] = _mm512_unpackhi_epi64(u[0], u[2]); + r[2] = _mm512_unpacklo_epi64(u[1], u[3]); + r[3] = _mm512_unpackhi_epi64(u[1], u[3]); + r[4] = _mm512_unpacklo_epi64(u[4], u[6]); + r[5] = _mm512_unpackhi_epi64(u[4], u[6]); + r[6] = _mm512_unpacklo_epi64(u[5], u[7]); + r[7] = _mm512_unpackhi_epi64(u[5], u[7]); + + __m512i const1 = _mm512_set_epi32( + 0x00370035, + 0x00330031, + 0x00270025, + 0x00230021, + 0x00170015, + 0x00130011, + 0x00070005, + 0x00030001, + 0x00360034, + 0x00320030, + 0x00260024, + 0x00220020, + 0x00160014, + 0x00120010, + 0x00060004, + 0x00020000); + __m512i const2 = _mm512_set_epi32( + 0x003f003d, + 0x003b0039, + 0x002f002d, + 0x002b0029, + 0x001f001d, + 0x001b0019, + 0x000f000d, + 0x000b0009, + 0x003e003c, + 0x003a0038, + 0x002e002c, + 0x002a0028, + 0x001e001c, + 0x001a0018, + 0x000e000c, + 0x000a0008); + // merge values from two regs + // 0-- 1-- + // 8-- 9-- + // 2-- 3-- + // 10-- 11-- + // 4-- 5-- + // 12-- 13-- + // 6-- 7-- + // 14-- 15-- +#pragma unroll(4) + for (int i = 0; i < 4; i++) { + u[i] = _mm512_permutex2var_epi16(r[i], const1, r[i + 4]); + u[i + 4] = _mm512_permutex2var_epi16(r[i], const2, r[i + 4]); + } +} + +// TODO(Leslie): Add the AVX2 Version of transpose_mxn for BFloat16 and Float16 +// Code referred to FBGEMM: +// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607 +template<> +inline void transpose_mxn( + const BFloat16* src, 
+ int64_t ld_src, + BFloat16* dst, + int64_t ld_dst) { + __m256i t[16]; + // load from src to registers + // a: a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 + // b: b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 + // c: c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 + // d: d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 + // e: e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 e10 e11 e12 e13 e14 e15 + // f: f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 + // g: g0 g1 g2 g3 g4 g5 g6 g7 g8 g9 g10 g11 g12 g13 g14 g15 + // h: h0 h1 h2 h3 h4 h5 h6 h7 h8 h9 h10 h11 h12 h13 h14 h15 + // i: i0 i1 i2 i3 i4 i5 i6 i7 i8 i9 i10 i11 i12 i13 i14 i15 + // j: j0 j1 j2 j3 j4 j5 j6 j7 j8 j9 j10 j11 j12 j13 j14 j15 + // k: k0 k1 k2 k3 k4 k5 k6 k7 k8 k9 k10 k11 k12 k13 k14 k15 + // l: l0 l1 l2 l3 l4 l5 l6 l7 l8 l9 l10 l11 l12 l13 l14 l15 + // m: m0 m1 m2 m3 m4 m5 m6 m7 m8 m9 m10 m11 m12 m13 m14 m15 + // n: n0 n1 n2 n3 n4 n5 n6 n7 n8 n9 n10 n11 n12 n13 n14 n15 + // o: o0 o1 o2 o3 o4 o5 o6 o7 o8 o9 o10 o11 o12 o13 o14 o15 + // p: p0 p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15 +#pragma unroll(16) + for (int i = 0; i < 16; i++) { + t[i] = _mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); + } + + __m512i u[8]; + _transpose_mxn_half_16_16(t, u); + +#pragma unroll(8) + for (int i = 0; i < 8; i++) { + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x0)); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(dst + (i * 2 + 1) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x01)); + } +} + +// Code referred to FBGEMM: +// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607 +template<> +inline void transpose_mxn( + const Half* src, + int64_t ld_src, + Half* dst, + int64_t ld_dst) { + __m256i t[16]; + // load from src to registers + // Same matrix indices as above transpose_mxn +#pragma unroll(16) + for (int i = 0; i < 16; i++) { + t[i] = _mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); + } + + __m512i u[8]; + _transpose_mxn_half_16_16(t, u); + +#pragma unroll(8) + for (int i = 0; i < 8; i++) { + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x0)); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(dst + (i * 2 + 1) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x01)); + } +} + +static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { + // t[0]: 0 32 1 33 2 34 3 35 8 40 9 41 10 42 11 43 16 ... 59 + // t[1]: 4 36 5 37 6 38 7 39 12 44 13 45 14 46 15 47 20 ... 63 + // t[2]: 64 96 65 97 66 98 67 99 72 104 73 105 74 106 75 ... 123 + // t[3]: 68 100 69 101 70 102 71 103 76 108 77 109 78 110 79 111 84 ... 127 + // t[4]: 128 160 129 161 130 162 131 163 136 168 137 169 138 170 139 171 144 ... 187 + // t[5]: 132 164 133 165 134 166 135 167 140 172 141 173 142 174 143 175 148 ... 191 + // t[6]: 192 224 193 225 194 226 195 227 200 232 201 233 202 234 203 235 208 ... 251 + // t[7]: 196 228 197 229 198 230 199 231 204 236 205 237 206 238 207 239 212 ... 255 + // t[8]: 256 288 257 289 258 290 259 291 264 296 265 297 266 298 267 299 272 ... 315 + // t[9]: 260 292 261 293 262 294 263 295 268 300 269 301 270 302 271 303 276 ... 319 + // t[10]: 320 352 321 353 322 354 323 355 328 360 329 361 330 362 331 363 336 ... 379 + // t[11]: 324 356 325 357 326 358 327 359 332 364 333 365 334 366 335 367 340 ... 383 + // t[12]: 384 416 385 417 386 418 387 419 392 424 393 425 394 426 395 427 400 ... 
443 + // t[13]: 388 420 389 421 390 422 391 423 396 428 397 429 398 430 399 431 404 ... 447 + // t[14]: 448 480 449 481 450 482 451 483 456 488 457 489 458 490 459 491 464 ... 507 + // t[15]: 452 484 453 485 454 486 455 487 460 492 461 493 462 494 463 495 468 ... 511 + // t[16]: 512 544 513 545 514 546 515 547 520 552 521 553 522 554 523 555 528 ... 571 + // ... + // t[31]: 964 996 965 997 966 998 967 999 972 1004 973 1005 974 1006 975 1007 980 ... 1023 +#pragma unroll(16) + for (int i = 0; i < 16; ++i) { + d[i * 2] = _mm512_unpacklo_epi16(r[i * 2], r[i * 2 + 1]); + d[i * 2 + 1] = _mm512_unpackhi_epi16(r[i * 2], r[i * 2 + 1]); + } + + // t[0]: 0 32 64 96 1 33 65 97 8 40 72 104 9 41 73 105 16 ... 121 + // t[1]: 2 34 66 98 3 35 67 99 10 42 74 106 11 43 75 107 18 ... 123 + // t[2]: 4 36 68 100 5 37 69 101 12 44 76 108 13 45 77 109 20 ... 125 + // t[3]: 6 38 70 102 7 39 71 103 14 46 78 110 15 47 79 111 22 ... 127 + // t[4]: 128 160 192 224 129 161 193 225 136 168 200 232 137 169 201 233 144 ... 249 + // t[5]: 130 162 194 226 131 163 195 227 138 170 202 234 139 171 203 235 146 ... 251 + // t[6]: 132 164 196 228 133 165 197 229 140 172 204 236 141 173 205 237 148 ... 253 + // t[7]: 134 166 198 230 135 167 199 231 142 174 206 238 143 175 207 239 150 ... 255 + // t[8]: 256 288 320 352 257 289 321 353 264 296 328 360 265 297 329 361 272 ... 377 + // t[9]: 258 290 322 354 259 291 323 355 266 298 330 362 267 299 331 363 274 ... 379 + // t[10]: 260 292 324 356 261 293 325 357 268 300 332 364 269 301 333 365 276 ... 381 + // t[11]: 262 294 326 358 263 295 327 359 270 302 334 366 271 303 335 367 278 ... 383 + // t[12]: 384 416 448 480 385 417 449 481 392 424 456 488 393 425 457 489 400 ... 505 + // t[13]: 386 418 450 482 387 419 451 483 394 426 458 490 395 427 459 491 402 ... 507 + // t[14]: 388 420 452 484 389 421 453 485 396 428 460 492 397 429 461 493 404 ... 509 + // t[15]: 390 422 454 486 391 423 455 487 398 430 462 494 399 431 463 495 406 ... 511 + // t[16]: 512 544 576 608 513 545 577 609 520 552 584 616 521 553 585 617 528 ... 633 + // ... + // t[31]: 902 934 966 998 903 935 967 999 910 942 974 1006 911 943 975 1007 918 ... 1023 +#pragma unroll(8) + for (int i = 0; i < 8; ++i) { + r[i * 4] = _mm512_unpacklo_epi32(d[i * 4], d[i * 4 + 2]); + r[i * 4 + 1] = _mm512_unpackhi_epi32(d[i * 4], d[i * 4 + 2]); + r[i * 4 + 2] = _mm512_unpacklo_epi32(d[i * 4 + 1], d[i * 4 + 3]); + r[i * 4 + 3] = _mm512_unpackhi_epi32(d[i * 4 + 1], d[i * 4 + 3]); + } + + // t[0]: 0 32 64 96 128 160 192 224 8 40 72 104 136 168 200 232 16 ... 248 + // t[1]: 1 33 65 97 129 161 193 225 9 41 73 105 137 169 201 233 17 ... 249 + // t[2]: 2 34 66 98 130 162 194 226 10 42 74 106 138 170 202 234 18 ... 250 + // t[3]: 3 35 67 99 131 163 195 227 11 43 75 107 139 171 203 235 19 ... 251 + // t[4]: 4 36 68 100 132 164 196 228 12 44 76 108 140 172 204 236 20 ... 252 + // t[5]: 5 37 69 101 133 165 197 229 13 45 77 109 141 173 205 237 21 ... 253 + // t[6]: 6 38 70 102 134 166 198 230 14 46 78 110 142 174 206 238 22 ... 254 + // t[7]: 7 39 71 103 135 167 199 231 15 47 79 111 143 175 207 239 23 ... 255 + // t[8]: 256 288 320 352 384 416 448 480 264 296 328 360 392 424 456 488 272 ... 504 + // t[9]: 257 289 321 353 385 417 449 481 265 297 329 361 393 425 457 489 273 ... 505 + // t[10]: 258 290 322 354 386 418 450 482 266 298 330 362 394 426 458 490 274 ... 506 + // t[11]: 259 291 323 355 387 419 451 483 267 299 331 363 395 427 459 491 275 ... 507 + // t[12]: 260 292 324 356 388 420 452 484 268 300 332 364 396 428 460 492 276 ... 
508 + // t[13]: 261 293 325 357 389 421 453 485 269 301 333 365 397 429 461 493 277 ... 509 + // t[14]: 262 294 326 358 390 422 454 486 270 302 334 366 398 430 462 494 278 ... 510 + // t[15]: 263 295 327 359 391 423 455 487 271 303 335 367 399 431 463 495 279 ... 511 + // t[16]: 512 544 576 608 640 672 704 736 520 552 584 616 648 680 712 744 528 ... 760 + // ... + // t[31]: 775 807 839 871 903 935 967 999 783 815 847 879 911 943 975 1007 791 ... 1023 +#pragma unroll(4) + for (int i = 0; i < 4; ++i) { + d[i * 8] = _mm512_unpacklo_epi64(r[i * 8], r[i * 8 + 4]); + d[i * 8 + 1] = _mm512_unpackhi_epi64(r[i * 8], r[i * 8 + 4]); + d[i * 8 + 2] = _mm512_unpacklo_epi64(r[i * 8 + 1], r[i * 8 + 5]); + d[i * 8 + 3] = _mm512_unpackhi_epi64(r[i * 8 + 1], r[i * 8 + 5]); + d[i * 8 + 4] = _mm512_unpacklo_epi64(r[i * 8 + 2], r[i * 8 + 6]); + d[i * 8 + 5] = _mm512_unpackhi_epi64(r[i * 8 + 2], r[i * 8 + 6]); + d[i * 8 + 6] = _mm512_unpacklo_epi64(r[i * 8 + 3], r[i * 8 + 7]); + d[i * 8 + 7] = _mm512_unpackhi_epi64(r[i * 8 + 3], r[i * 8 + 7]); + } + + // t[0]: 0 32 64 96 128 160 192 224 256 288 320 352 384 416 448 480 16 ... 496 + // t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 17 ... 497 + // t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 450 482 18 ... 498 + // t[3]: 3 35 67 99 131 163 195 227 259 291 323 355 387 419 451 483 19 ... 499 + // t[4]: 4 36 68 100 132 164 196 228 260 292 324 356 388 420 452 484 20 ... 500 + // t[5]: 5 37 69 101 133 165 197 229 261 293 325 357 389 421 453 485 21 ... 501 + // t[6]: 6 38 70 102 134 166 198 230 262 294 326 358 390 422 454 486 22 ... 502 + // t[7]: 7 39 71 103 135 167 199 231 263 295 327 359 391 423 455 487 23 ... 503 + // t[8]: 8 40 72 104 136 168 200 232 264 296 328 360 392 424 456 488 24 ... 504 + // t[9]: 9 41 73 105 137 169 201 233 265 297 329 361 393 425 457 489 25 ... 505 + // t[10]: 10 42 74 106 138 170 202 234 266 298 330 362 394 426 458 490 26 ... 506 + // t[11]: 11 43 75 107 139 171 203 235 267 299 331 363 395 427 459 491 27 ... 507 + // t[12]: 12 44 76 108 140 172 204 236 268 300 332 364 396 428 460 492 28 ... 508 + // t[13]: 13 45 77 109 141 173 205 237 269 301 333 365 397 429 461 493 29 ... 509 + // t[14]: 14 46 78 110 142 174 206 238 270 302 334 366 398 430 462 494 30 ... 510 + // t[15]: 15 47 79 111 143 175 207 239 271 303 335 367 399 431 463 495 31 ... 511 + // t[16]: 512 544 576 608 640 672 704 736 768 800 832 864 896 928 960 992 528 ... 1008 + // ... + // t[31]: 527 559 591 623 655 687 719 751 783 815 847 879 911 943 975 1007 543 ... 1023 + __m512i const1 = _mm512_set_epi64( + 0x000000000000000d, + 0x000000000000000c, + 0x0000000000000005, + 0x0000000000000004, + 0x0000000000000009, + 0x0000000000000008, + 0x0000000000000001, + 0x0000000000000000); + __m512i const2 = _mm512_set_epi64( + 0x000000000000000f, + 0x000000000000000e, + 0x0000000000000007, + 0x0000000000000006, + 0x000000000000000b, + 0x000000000000000a, + 0x0000000000000003, + 0x0000000000000002); +#pragma unroll(8) + for (int i = 0; i < 8; ++i) { + r[i] = _mm512_permutex2var_epi64(d[i], /*idx*/const1, d[i + 8]); + r[i + 8] = _mm512_permutex2var_epi64(d[i], /*idx*/const2, d[i + 8]); + r[i + 16] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/const1, d[i + 24]); + r[i + 24] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/const2, d[i + 24]); + } + + // t[0]: 0 32 64 96 128 160 192 224 256 288 320 352 384 416 448 480 512 544 ... 992 + // t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 513 545 ... 
993 + // t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 450 482 514 546 ... 994 + // t[3]: 3 35 67 99 131 163 195 227 259 291 323 355 387 419 451 483 515 547 ... 995 + // t[4]: 4 36 68 100 132 164 196 228 260 292 324 356 388 420 452 484 516 548 ... 996 + // t[5]: 5 37 69 101 133 165 197 229 261 293 325 357 389 421 453 485 517 549 ... 997 + // t[6]: 6 38 70 102 134 166 198 230 262 294 326 358 390 422 454 486 518 550 ... 998 + // t[7]: 7 39 71 103 135 167 199 231 263 295 327 359 391 423 455 487 519 551 ... 999 + // t[8]: 8 40 72 104 136 168 200 232 264 296 328 360 392 424 456 488 520 552 ... 1000 + // t[9]: 9 41 73 105 137 169 201 233 265 297 329 361 393 425 457 489 521 553 ... 1001 + // t[10]: 10 42 74 106 138 170 202 234 266 298 330 362 394 426 458 490 522 554 ... 1002 + // t[11]: 11 43 75 107 139 171 203 235 267 299 331 363 395 427 459 491 523 555 ... 1003 + // t[12]: 12 44 76 108 140 172 204 236 268 300 332 364 396 428 460 492 524 556 ... 1004 + // t[13]: 13 45 77 109 141 173 205 237 269 301 333 365 397 429 461 493 525 557 ... 1005 + // t[14]: 14 46 78 110 142 174 206 238 270 302 334 366 398 430 462 494 526 558 ... 1006 + // t[15]: 15 47 79 111 143 175 207 239 271 303 335 367 399 431 463 495 527 559 ... 1007 + // t[16]: 16 48 80 112 144 176 208 240 272 304 336 368 400 432 464 496 528 560 ... 1008 + // ... + // t[31]: 31 63 95 127 159 191 223 255 287 319 351 383 415 447 479 511 543 575 ... 1023 + __m512i const3 = _mm512_set_epi64( + 0x000000000000000b, + 0x000000000000000a, + 0x0000000000000009, + 0x0000000000000008, + 0x0000000000000003, + 0x0000000000000002, + 0x0000000000000001, + 0x0000000000000000); + __m512i const4 = _mm512_set_epi64( + 0x000000000000000f, + 0x000000000000000e, + 0x000000000000000d, + 0x000000000000000c, + 0x0000000000000007, + 0x0000000000000006, + 0x0000000000000005, + 0x0000000000000004); +#pragma unroll(16) + for (int i = 0; i < 16; ++i) { + d[i] = _mm512_permutex2var_epi64(r[i], /*idx*/const3, r[i + 16]); + d[i + 16] = _mm512_permutex2var_epi64(r[i], /*idx*/const4, r[i + 16]); + } +} + +// Code referred to FBGEMM: +// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#LL19C6-L19C6 +template<> +inline void transpose_mxn( + const BFloat16* src, + int64_t ld_src, + BFloat16* dst, + int64_t ld_dst) { + // Load from memory + __m512i r[32]; +#pragma unroll(32) + for (int i = 0; i < 32; ++i) { + r[i] = _mm512_loadu_si512(reinterpret_cast(src + i* ld_src)); + } + + __m512i d[32]; + _transpose_mxn_half_32_32(r, d); + + // Store to dst +#pragma unroll(32) + for (int i = 0; i < 32; ++i) { + _mm512_storeu_si512(dst + i* ld_dst, d[i]); + } +} + +template<> +inline void transpose_mxn( + const Half* src, + int64_t ld_src, + Half* dst, + int64_t ld_dst) { + // Load from memory + __m512i r[32]; +#pragma unroll(32) + for (int i = 0; i < 32; ++i) { + r[i] = _mm512_loadu_si512(reinterpret_cast(src + i* ld_src)); + } + + __m512i d[32]; + _transpose_mxn_half_32_32(r, d); + + // Store to dst +#pragma unroll(32) + for (int i = 0; i < 32; ++i) { + _mm512_storeu_si512(dst + i* ld_dst, d[i]); + } +} + template <> class Vectorized: public Vectorized16 { public: @@ -1160,7 +1589,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); CONVERT_VECTORIZED_INIT(Half, half); -#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#else //defined(CPU_CAPABILITY_AVX512) #define CONVERT_NON_VECTORIZED_INIT(type, name) \ inline std::tuple, Vectorized> 
convert_##name##_float(const Vectorized& a) { \ @@ -1190,9 +1619,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); CONVERT_NON_VECTORIZED_INIT(Half, half); -#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#endif // defined(CPU_CAPABILITY_AVX512) -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) #define LOAD_FP32_VECTORIZED_INIT(type, name) \ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ @@ -1211,7 +1640,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); LOAD_FP32_VECTORIZED_INIT(Half, fp16); -#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#else // defined(CPU_CAPABILITY_AVX512) #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ __at_align__ float values[Vectorized::size()]; \ diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h index 02aa3a87cc130..c35204f9da23e 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h @@ -7,7 +7,8 @@ #include #include #include -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) +#define SLEEF_STATIC_LIBS #include #endif @@ -16,7 +17,7 @@ namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) template <> class Vectorized> { private: @@ -203,7 +204,7 @@ template <> class Vectorized> { auto abs = abs_(); auto zero = _mm512_setzero_pd(); auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); - auto div = values / abs; + auto div = _mm512_div_pd(values, abs); return _mm512_mask_blend_pd(mask, div, zero); } __m512d real_() const { diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h index a5d790c98b2f2..2801e484d94ce 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h @@ -7,7 +7,8 @@ #include #include #include -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) +#define SLEEF_STATIC_LIBS #include #endif @@ -16,7 +17,7 @@ namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) template <> class Vectorized> { private: @@ -708,7 +709,7 @@ template <> class Vectorized> { auto abs = abs_(); auto zero = _mm512_setzero_ps(); auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); - auto div = values / abs; + auto div = _mm512_div_ps(values, abs); return _mm512_mask_blend_ps(mask, div, zero); } __m512 real_() const { diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_convert.h b/aten/src/ATen/cpu/vec/vec512/vec512_convert.h new file mode 100644 index 0000000000000..e8ad662a99fc2 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec512/vec512_convert.h @@ -0,0 +1,181 @@ +#pragma once + +#include +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + +template <> +struct VecConvert { + static inline VectorizedN 
apply( + const VectorizedN& src) { + VectorizedN result; + __m512 value; + cvtbf16_fp32(_mm512_castsi512_si256(src[0]), value); + result[0] = value; + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + __m512 value; + cvtfp16_fp32(_mm512_castsi512_si256(src[0]), value); + result[0] = value; + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + result[0] = _mm512_castsi256_si512(cvtfp32_bf16(src[0])); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + result[0] = _mm512_castsi256_si512(cvtfp32_fp16(src[0])); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto low = _mm512_cvtepi64_ps(src[0]); + auto high = _mm512_cvtepi64_ps(src[1]); + return Vectorized( + _mm512_insertf32x8(_mm512_castps256_ps512(low), high, 1)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + at::vec::VectorizedN result; + result[0] = _mm512_cvt_roundps_epi64( + _mm512_castps512_ps256(src[0]), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + result[1] = _mm512_cvt_roundps_epi64( + _mm512_extractf32x8_ps(src[0], 1), + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto low = _mm512_cvtepi64_epi32(src[0]); + auto high = _mm512_cvtepi64_epi32(src[1]); + return Vectorized( + _mm512_inserti32x8(_mm512_castsi256_si512(low), high, 1)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + at::vec::VectorizedN result; + result[0] = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(src[0])); + result[1] = _mm512_cvtepi32_epi64(_mm512_extracti32x8_epi32(src[0], 1)); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src128 = _mm512_castsi512_si128(src[0]); + return Vectorized(_mm512_cvtepi8_epi32(src128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src128 = _mm512_castsi512_si128(src[0]); + return Vectorized(_mm512_cvtepu8_epi32(src128)); + } +}; + +template +struct VecConvert< + dst_t, + 1, + src_t, + 1, + typename std::enable_if_t< + (is_reduced_floating_point_v && is_8bit_integer_v) || + (is_reduced_floating_point_v && is_8bit_integer_v), + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN tmp_fp32 = VecConvert::apply(src); + return VecConvert::apply(tmp_fp32); + } +}; + +template +struct VecConvert< + dst_t, + 1, + float, + 1, + typename std::enable_if_t, + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_float_to_int8(src[0]); + } +}; + +template +struct VecConvert< + float, + 1, + src_t, + 1, + typename std::enable_if_t, + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_int8_to_float(src[0]); + } +}; + +template +struct VecConvert< + dst_t, + 1, + int64_t, + 2, + typename std::enable_if< + std::is_same_v || + std::is_same_v>::type> { + static inline VectorizedN apply( + const VectorizedN& src) { + return VecConvert::apply( + VecConvert::apply(src)); + } +}; + +#endif + +} // namespace CPU_CAPABILITY 
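// Illustrative usage, not part of the patch: the AVX-512 VecConvert
// specializations above are reached through the generic at::vec::convert<>
// helpers declared in vec_convert.h. Assuming an AVX-512 build, widening the
// low 16 bf16 lanes of a vector to fp32 could look like this (the helper name
// bf16_lo_to_fp32 is made up for the example):
inline Vectorized<float> bf16_lo_to_fp32(const Vectorized<c10::BFloat16>& src) {
  // Dispatches to the bf16 -> fp32 VecConvert specialization above, which
  // converts only the first Vectorized<float>::size() elements of src.
  return convert<float>(src);
}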
+} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h index 2089e3a6c620b..508ab257e603b 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h @@ -6,7 +6,8 @@ #include #include #include -#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) +#if (defined(CPU_CAPABILITY_AVX512)) +#define SLEEF_STATIC_LIBS #include #endif @@ -15,7 +16,7 @@ namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) template <> class Vectorized { private: @@ -106,6 +107,10 @@ template <> class Vectorized { return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); } + bool has_inf_nan() const { + __m512d self_sub = _mm512_sub_pd(values, values); + return (_mm512_movepi8_mask(_mm512_castpd_si512(self_sub)) & 0x7777777777777777) != 0; + } Vectorized map(double (*const f)(double)) const { __at_align__ double tmp[size()]; store(tmp); @@ -145,6 +150,9 @@ template <> class Vectorized { Vectorized acos() const { return Vectorized(Sleef_acosd8_u10(values)); } + Vectorized acosh() const { + return Vectorized(Sleef_acoshd8_u10(values)); + } Vectorized asin() const { return Vectorized(Sleef_asind8_u10(values)); } diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h index 633b5990a26b1..a08df3c141a38 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h @@ -6,7 +6,8 @@ #include #include #include -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) +#define SLEEF_STATIC_LIBS #include #endif @@ -15,7 +16,7 @@ namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) template <> class Vectorized { private: @@ -125,6 +126,10 @@ template <> class Vectorized { return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); } + bool has_inf_nan() const { + __m512 self_sub = _mm512_sub_ps(values, values); + return (_mm512_movepi8_mask(_mm512_castps_si512(self_sub)) & 0x7777777777777777) != 0; + } Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; store(tmp); @@ -164,6 +169,9 @@ template <> class Vectorized { Vectorized acos() const { return Vectorized(Sleef_acosf16_u10(values)); } + Vectorized acosh() const { + return Vectorized(Sleef_acoshf16_u10(values)); + } Vectorized asin() const { return Vectorized(Sleef_asinf16_u10(values)); } @@ -239,14 +247,14 @@ template <> class Vectorized { static __m512 vec_factorial_5 = _mm512_set1_ps(0.00828929059f); // 1/factorial(5) static __m512 vec_exp_log2ef = - (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) + _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) static __m512 vec_half = _mm512_set1_ps(0.5f); static __m512 vec_one = _mm512_set1_ps(1.f); static __m512 vec_zero = _mm512_set1_ps(0.f); static __m512 vec_two = _mm512_set1_ps(2.f); - static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2) - static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); - static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); + static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) + static __m512 vec_ln_flt_min = 
_mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); + static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); static int n_mantissa_bits = 23; @@ -281,7 +289,7 @@ template <> class Vectorized { auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); - auto vec_two_pow_n = (__m512)vec_two_pow_n_i; + auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); vec_two_pow_n = _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h index 2610d344380b3..1022221c81a19 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h @@ -540,7 +540,7 @@ class Vectorized : public Vectorizedi { template class Vectorized8 : public Vectorizedi { static_assert( - std::is_same::value || std::is_same::value, + std::is_same_v || std::is_same_v, "Only int8_t/uint8_t are supported"); protected: static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; @@ -1069,7 +1069,7 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized< template <> Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { - return _mm512_max_epi8(a, b); + return _mm512_max_epu8(a, b); } template <> @@ -1320,7 +1320,7 @@ inline Vectorized Vectorized::le(const Vectorized& ot return (*this <= other) & Vectorized(1); } -template ::value || std::is_same::value, int> = 0> +template || std::is_same_v, int> = 0> Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) { // No vector instruction for shifting int8_t/uint8_t, so emulating // it instead. 
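The has_inf_nan() members added above for Vectorized<double> and Vectorized<float> exploit the identity that x - x is zero for every finite x but NaN for +/-inf and NaN inputs; the vector code then detects the resulting NaN lanes with a byte-level mask test. A scalar sketch of the same idea, illustrative only and not part of the patch:

static bool has_inf_nan_scalar(const float* v, int n) {
  for (int i = 0; i < n; ++i) {
    float d = v[i] - v[i];  // 0.0f when v[i] is finite, NaN when it is +/-inf or NaN
    if (d != d) {           // only NaN compares unequal to itself
      return true;
    }
  }
  return false;
}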
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_mask.h b/aten/src/ATen/cpu/vec/vec512/vec512_mask.h new file mode 100644 index 0000000000000..9ba1b18372eb5 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec512/vec512_mask.h @@ -0,0 +1,155 @@ +#pragma once + +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + +template +struct VecMaskLoad< + T, + 1, + mask_t, + 1, + typename std::enable_if_t< + std::is_same_v || std::is_same_v || + std::is_same_v, + void>> { + static inline VectorizedN apply( + const T* ptr, + const VecMask& vec_mask) { + at::vec::Vectorized zero_vec(0); + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto int_mask = vec_mask.template cast()[0]; + auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); + if constexpr (std::is_same_v) { + return Vectorized(_mm512_mask_loadu_ps(zero_vec, mmask, ptr)); + } else { + return Vectorized(_mm512_mask_loadu_epi32(zero_vec, mmask, ptr)); + } + } +}; + +template +struct VecMaskLoad< + data_t, + 1, + mask_t, + 1, + typename std::enable_if< + std::is_same_v || + std::is_same_v>::type> { + static inline VectorizedN apply( + const data_t* ptr, + const VecMask& vec_mask) { + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto int_mask = vec_mask.template cast()[0]; + auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); + auto zero = _mm256_set1_epi16(0); + auto temp = _mm256_mask_loadu_epi16(zero, mmask, ptr); + return Vectorized( + _mm512_inserti32x8(_mm512_castsi256_si512(temp), zero, 1)); + } +}; + +template +struct VecMaskLoad< + data_t, + 1, + mask_t, + 1, + typename std::enable_if< + std::is_same_v || + std::is_same_v>::type> { + static inline VectorizedN apply( + const data_t* ptr, + const VecMask& vec_mask) { + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto int_mask = vec_mask.template cast()[0]; + auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); + auto zero = _mm_set1_epi8(0); + auto temp = _mm_mask_loadu_epi8(zero, mmask, ptr); + return Vectorized( + _mm512_inserti64x2(_mm512_set1_epi32(0), temp, 0)); + } +}; + +template +struct VecMaskLoad { + static inline VectorizedN apply( + const int64_t* ptr, + const VecMask& vec_mask) { + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto zero = _mm512_set1_epi64(0); + auto int_mask = vec_mask.template cast()[0]; + auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); + at::vec::VectorizedN result; + result[0] = _mm512_mask_loadu_epi64(zero, (__mmask8)mmask, ptr); + result[1] = _mm512_mask_loadu_epi64(zero, (__mmask8)(mmask >> 8), ptr + 8); + return result; + } +}; + +template <> +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + return Vectorized(_mm512_castsi512_ps(vec_mask[0])); + } +}; + +template <> +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + return Vectorized(_mm512_castps_si512(vec_mask[0])); + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + auto int_vec = convert(VectorizedN(vec_mask)); + return VecMask(int_vec).cast(); + } +}; + +template <> +inline bool VecMask::all_zero() const { + __mmask16 mask = _mm512_test_epi32_mask(mask_[0], mask_[0]); + return mask == 0; +} + +template <> +inline bool VecMask::is_masked(int i) const { + return _mm512_movepi32_mask(mask_[0]) & (1 << i); +} + +template <> +inline bool VecMask::all_masked() const { + __mmask16 mask = _mm512_movepi32_mask(mask_[0]); 
+ return mask == 0xffff; +} + +#define VEC_MASK_METHOD_WITH_CAST_TO_INT( \ + T, N, return_type, method, args_def, args) \ + template <> \ + inline return_type VecMask::method args_def const { \ + return cast().method args; \ + } + +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_zero, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_zero, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, is_masked, (int i), (i)) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, is_masked, (int i), (i)) +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_masked, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_masked, (), ()) + +#undef VEC_MASK_DEFINE_METHOD_WITH_CAST_TO_INT + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h index b03da5d2c3e95..21389da3cdc03 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h @@ -42,11 +42,17 @@ namespace at { namespace vec { inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) +#ifdef _MSC_VER +__declspec(align(64)) struct Vectorizedqi { + protected: + __m512i vals; +#else struct Vectorizedqi { protected: __m512i vals __attribute__((aligned(64))); +#endif public: Vectorizedqi() {} @@ -98,28 +104,36 @@ inline __m512i pack_saturate_and_clamp( _mm512_min_epu8(packed_and_sat, _mm512_set1_epi8(max_val))); } -inline Vectorized convert_uint8_to_float(at::vec::Vectorized src) { +template +typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> +inline convert_int8_to_float(at::vec::Vectorized src) { // Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size() - // Only handle first 128 bits + // Only handle first 16*8 bits __m128i input_128 = _mm512_castsi512_si128(src); - // Convert from 16*u8 to 16*int32 - __m512i input_512_extended = _mm512_cvtepu8_epi32(input_128); + // Convert from 16*uint8/int8 to 16*int32 + __m512i input_512_extended; + if constexpr (std::is_same_v) + input_512_extended = _mm512_cvtepu8_epi32(input_128); + else + input_512_extended = _mm512_cvtepi8_epi32(input_128); // Convert from 16*int32 to 16*float32 return _mm512_cvtepi32_ps(input_512_extended); } -inline Vectorized convert_float_to_uint8(at::vec::Vectorized src) { +template +typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> +inline convert_float_to_int8(at::vec::Vectorized src) { // Convert from float32 to int32 with truncation __m512i x_values_int32 = _mm512_cvttps_epi32(src); // Convert from int32 to int16 using signed saturation __m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32); - constexpr auto min_val = std::numeric_limits::min(); - constexpr auto max_val = std::numeric_limits::max(); + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); - // Convert from int16 to uint8 using unsigned saturation - __m512i xyzw_clamped_v = pack_saturate_and_clamp( + // Convert from int16 to uint8/int8 using unsigned saturation + __m512i xyzw_clamped_v = pack_saturate_and_clamp( xy_packed_v, xy_packed_v, min_val, max_val); __m512i permute_mask_v = _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02, @@ -128,7 +142,7 @@ inline Vectorized convert_float_to_uint8(at::vec::Vectorized src } template -inline void __attribute__((always_inline)) QuantizeAvx512( +__FORCE_INLINE void 
QuantizeAvx512( const float* src, T* dst, int len, @@ -406,7 +420,7 @@ __m512i RequantizeAvx512( __m512 multiplier, __m512i zp) { static_assert( - std::is_same::value || std::is_same::value, + std::is_same_v || std::is_same_v, "Only int8_t/uint8_t are supported"); constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); @@ -517,10 +531,17 @@ struct Vectorized : public Vectorizedqi { Vectorized scale, Vectorized zero_point, Vectorized scale_neg_zp_premul) const { + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); + #else __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); + #endif __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); @@ -541,10 +562,17 @@ struct Vectorized : public Vectorizedqi { float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point) const { + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); + #else __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); + #endif __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); @@ -590,20 +618,34 @@ struct Vectorized : public Vectorizedqi { } int_vec_return_type widening_subtract(Vectorized b) const { + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); + #else __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); + #endif __m512i int32_val0 = cvtepi8_epi32(int_val0); __m512i int32_val1 = cvtepi8_epi32(int_val1); __m512i int32_val2 = cvtepi8_epi32(int_val2); __m512i int32_val3 = cvtepi8_epi32(int_val3); + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); + #else __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); + #endif 
__m512i int32_b0 = cvtepi8_epi32(int_b0); __m512i int32_b1 = cvtepi8_epi32(int_b1); @@ -713,10 +755,17 @@ struct Vectorized : public Vectorizedqi { Vectorized scale, Vectorized zero_point, Vectorized scale_zp_premul) const { + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); + #else __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); + #endif __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); @@ -738,10 +787,17 @@ struct Vectorized : public Vectorizedqi { float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point) const { + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); + #else __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); + #endif __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); @@ -788,20 +844,34 @@ struct Vectorized : public Vectorizedqi { } int_vec_return_type widening_subtract(Vectorized b) const { + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); + #else __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); + #endif __m512i int32_val0 = cvtepu8_epi32(int_val0); __m512i int32_val1 = cvtepu8_epi32(int_val1); __m512i int32_val2 = cvtepu8_epi32(int_val2); __m512i int32_val3 = cvtepu8_epi32(int_val3); + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); + #else __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); + #endif __m512i int32_b0 = cvtepu8_epi32(int_b0); __m512i int32_b1 = cvtepu8_epi32(int_b1); diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h index e5955a802d016..d696c97b59497 100644 --- a/aten/src/ATen/cpu/vec/vec_base.h +++ b/aten/src/ATen/cpu/vec/vec_base.h @@ 
-26,18 +26,22 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include #include #include +#if defined(__GNUC__) +#define __FORCE_INLINE __attribute__((always_inline)) inline +#elif defined(_MSC_VER) +#define __FORCE_INLINE __forceinline +#endif + // These macros helped us unify vec_base.h #ifdef CPU_CAPABILITY_AVX512 #if defined(__GNUC__) @@ -68,9 +72,9 @@ inline namespace CPU_CAPABILITY { template struct is_floating_point: std::integral_constant::value || - std::is_same::value || - std::is_same::value> { + std::is_floating_point_v || + std::is_same_v || + std::is_same_v> { }; template @@ -79,13 +83,23 @@ constexpr bool is_floating_point_v = is_floating_point::value; template struct is_reduced_floating_point: std::integral_constant::value || - std::is_same::value> { + std::is_same_v || + std::is_same_v> { }; template constexpr bool is_reduced_floating_point_v = is_reduced_floating_point::value; +template +struct is_8bit_integer: + std::integral_constant || + std::is_same_v> { +}; + +template +constexpr bool is_8bit_integer_v = is_8bit_integer::value; + template struct int_of_size; #define DEFINE_INT_OF_SIZE(int_t) \ @@ -147,9 +161,8 @@ struct Vectorized { // versions GCC/Clang have buggy determinations on whether or not an // identifier is odr-used or not, and in any case it's hard to tell if // a variable is odr-used or not. So best to just cut the problem at the root. - static constexpr size_type size_T = sizeof(T); // Workaround to compile with VS2022. static constexpr size_type size() { - return VECTOR_WIDTH / size_T; + return VECTOR_WIDTH / sizeof(T); } Vectorized() : values{static_cast(0)} {} Vectorized(T val) { @@ -231,6 +244,11 @@ struct Vectorized { std::memcpy(vector.values, ptr, count * sizeof(T)); return vector; } + static Vectorized loadu_one_fourth(const void* ptr) { + static_assert(std::is_same_v || std::is_same_v, "For byte types only"); + return Vectorized::loadu(ptr, 8); + } + void store(void* ptr, int count = size()) const { std::memcpy(ptr, values, count * sizeof(T)); } @@ -255,6 +273,14 @@ struct Vectorized { } return vector; } + bool has_inf_nan() const { + for (int64_t i = 0; i != size(); i++) { + if(_isnan(values[i]) || _isinf(values[i])) { + return true; + } + } + return false; + } Vectorized map(T (*const f)(T)) const { Vectorized ret; for (int64_t i = 0; i != size(); i++) { @@ -270,95 +296,98 @@ struct Vectorized { return ret; } template && !c10::is_complex::value, int>::type = 0> + typename std::enable_if_t && !c10::is_complex::value, int> = 0> Vectorized abs() const { // other_t_abs is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "other_t_abs must be T"); + static_assert(std::is_same_v, "other_t_abs must be T"); return map([](T x) -> T { return x < static_cast(0) ? -x : x; }); } template , int>::type = 0> + typename std::enable_if_t, int> = 0> Vectorized abs() const { // float_t_abs is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "float_t_abs must be T"); + static_assert(std::is_same_v, "float_t_abs must be T"); // Specifically deal with floating-point because the generic code above won't handle -0.0 (which should result in // 0.0) properly. return map([](T x) -> T { return std::abs(x); }); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized abs() const { // complex_t_abs is for SFINAE and clarity. Make sure it is not changed. 
- static_assert(std::is_same::value, "complex_t_abs must be T"); + static_assert(std::is_same_v, "complex_t_abs must be T"); // Specifically map() does not perform the type conversion needed by abs. return map([](T x) { return static_cast(std::abs(x)); }); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized sgn() const { return map(at::native::sgn_impl); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized angle() const { // other_t_angle is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "other_t_angle must be T"); + static_assert(std::is_same_v, "other_t_angle must be T"); return map(at::native::angle_impl); // compiler is unable to resolve the overload without } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized angle() const { // complex_t_angle is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "complex_t_angle must be T"); + static_assert(std::is_same_v, "complex_t_angle must be T"); return map([](T x) { return static_cast(std::arg(x)); }); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized real() const { // other_t_real is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "other_t_real must be T"); + static_assert(std::is_same_v, "other_t_real must be T"); return *this; } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized real() const { // complex_t_real is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "complex_t_real must be T"); + static_assert(std::is_same_v, "complex_t_real must be T"); return map([](T x) { return static_cast(x.real()); }); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized imag() const { // other_t_imag is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "other_t_imag must be T"); + static_assert(std::is_same_v, "other_t_imag must be T"); return Vectorized(0); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized imag() const { // complex_t_imag is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "complex_t_imag must be T"); + static_assert(std::is_same_v, "complex_t_imag must be T"); return map([](T x) { return static_cast(x.imag()); }); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized conj() const { // other_t_conj is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "other_t_conj must be T"); + static_assert(std::is_same_v, "other_t_conj must be T"); return *this; } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized conj() const { // complex_t_conj is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "complex_t_conj must be T"); + static_assert(std::is_same_v, "complex_t_conj must be T"); return map([](T x) { return static_cast(std::conj(x)); }); } Vectorized acos() const { return map(std::acos); } + Vectorized acosh() const { + return map(std::acosh); + } Vectorized asin() const { return map(std::asin); } @@ -414,7 +443,7 @@ struct Vectorized { typename std::enable_if_t, int> = 0> Vectorized fmod(const Vectorized& q) const { // U is for SFINAE purposes only. 
Make sure it is not changed. - static_assert(std::is_same::value, "U must be T"); + static_assert(std::is_same_v, "U must be T"); Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = std::fmod(values[i], q[i]); @@ -431,17 +460,17 @@ struct Vectorized { return map(std::log1p); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized log2() const { // other_t_log2 is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "other_t_log2 must be T"); + static_assert(std::is_same_v, "other_t_log2 must be T"); return map(std::log2); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized log2() const { // complex_t_log2 is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "complex_t_log2 must be T"); + static_assert(std::is_same_v, "complex_t_log2 must be T"); const T log_2 = T(std::log(2.0)); return Vectorized(map(std::log))/Vectorized(log_2); } @@ -613,6 +642,12 @@ template Vectorized inline operator/(const Vectorized &a, const return c; } +template , int> = 0> +Vectorized inline operator%(const Vectorized &a, const Vectorized &b) __ubsan_ignore_float_divide_by_zero__ { + return a - a / b * b; +} + template Vectorized inline operator||( const Vectorized &a, const Vectorized &b) { Vectorized c; @@ -625,7 +660,7 @@ template Vectorized inline operator||( // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -641,7 +676,7 @@ Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -659,7 +694,7 @@ Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. 
template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -675,7 +710,7 @@ Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -691,7 +726,7 @@ Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline clamp(const Vectorized &a, const Vectorized &min_vec, const Vectorized &max_vec) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -701,7 +736,7 @@ Vectorized inline clamp(const Vectorized &a, const Vectorized &min_vec, } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline clamp_max(const Vectorized &a, const Vectorized &max_vec) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -711,7 +746,7 @@ Vectorized inline clamp_max(const Vectorized &a, const Vectorized &max_ } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline clamp_min(const Vectorized &a, const Vectorized &min_vec) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -821,8 +856,8 @@ inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { template>::value, int> = 0> inline Vectorized operator~(const Vectorized& a) { - Vectorized ones; // All bits are 1 - memset((T*) ones, 0xFF, VECTOR_WIDTH); + using int_t = int_same_size_t; + Vectorized ones(c10::bit_cast((int_t)(~(int_t)0))); // All bits are 1 return a ^ ones; } @@ -980,6 +1015,19 @@ inline Vectorized convert_to_int_of_same_size(const Vectorized& src) return Vectorized::loadu(static_cast(buffer.data())); } +template > +inline Vectorized convert_to_fp_of_same_size(const Vectorized& src) { + static_assert(sizeof(T) == sizeof(IntType)); + static constexpr int size = Vectorized::size(); + + std::array src_arr; + src.store(static_cast(src_arr.data())); + std::array buffer; + std::transform(src_arr.cbegin(), src_arr.cend(), buffer.begin(), + [](const IntType& x) { return static_cast(x); }); + return Vectorized::loadu(static_cast(buffer.data())); +} + // Example inputs for AVX512: // a Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} // b Vectorized = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15} @@ -1079,3 +1127,8 @@ inline void transpose_mxn(const T* src, int64_t ld_src, T* dst, int64_t ld_dst) } }} // namespace at::vec::CPU_CAPABILITY + +// additional headers for more operations that depend on vec_base +#include +#include +#include diff --git a/aten/src/ATen/cpu/vec/vec_convert.h b/aten/src/ATen/cpu/vec/vec_convert.h new file mode 100644 index 0000000000000..56488928156af --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec_convert.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + typename Enabled = void> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + constexpr int count = std::min( + VectorizedN::size(), VectorizedN::size()); + __at_align__ src_t src_buf[VectorizedN::size()]; + src.store(src_buf); + 
__at_align__ dst_t dst_buf[VectorizedN::size()]; + for (int i = 0; i < count; i++) { + dst_buf[i] = static_cast(src_buf[i]); + } + return VectorizedN::loadu(dst_buf, count); + } +}; + +template +inline Vectorized convert(const Vectorized& src) { + return VecConvert::apply(src); +} + +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + std::enable_if_t = 0> +inline VectorizedN convert(const VectorizedN& src) { + return VecConvert::apply(src); +} + +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + std::enable_if_t = 0> +inline Vectorized convert(const VectorizedN& src) { + return VecConvert::apply(src); +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec_mask.h b/aten/src/ATen/cpu/vec/vec_mask.h new file mode 100644 index 0000000000000..90f0f98962d90 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec_mask.h @@ -0,0 +1,266 @@ +#pragma once + +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +/** + * The `VecMask` class provides a convenient interface for working with + * vectorized masks in SIMD operations. It encapsulates a `Vectorized` + * mask that can be directly usable in masked vectorized operations. It provides + * various methods for manipulating and accessing the mask elements: + * 1. `from` and `to`: Conversion between a vector of boolean values and a + * vectorized mask. + * 2. `cast`: Casts the mask to a different base type. + * 3. `all_zero`: Checks if all mask elements are zero. + * 4. `is_masked`: Checks if a specific element is masked. + * 5. `loadu`: Loads data from memory using the mask. + * 6. `all_masked`: Checks if all mask elements are masked. + * + * Some helper template classes are provided to simplify the specialization of + * the `VecMask` for the specific CPU arch: + * 1. `VecMaskLoad`: Loads data from memory using the mask. + * 2. `VecMaskTo`: Converts the mask to boolean. + * 3. `VecMaskCast`: Casts the mask to a different base type. + * + */ +template +class VecMask; + +template < + typename data_t, + int data_n, + typename mask_t, + int mask_n, + typename Enabled = void> +struct VecMaskLoad { + static inline VectorizedN apply( + const data_t* ptr, + const VecMask& vec_mask) { + constexpr typename VecMask::size_type size = + VecMask::size(); + static_assert(VectorizedN::size() >= size); + __at_align__ data_t data[size]; + __at_align__ mask_t mask[size]; + auto mask_ = VectorizedN(vec_mask); + mask_.store(mask); + for (int i = 0; i < size; i++) { + data[i] = mask[i] ? 
ptr[i] : static_cast(0); + } + return VectorizedN::loadu(data, size); + } +}; + +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + typename Enabled = void> +struct VecMaskTo { + static inline VecMask apply( + const VecMask& vec_mask) { + auto zeros = VectorizedN(static_cast(0)); + auto ones = VectorizedN(static_cast(1)); + return VectorizedN::blendv( + zeros, ones, vec_mask.template cast()); + } +}; + +template +struct VecMaskCast { + static inline VecMask apply( + const VecMask& vec_mask) { + return VecMask::from(VectorizedN(vec_mask)); + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + return vec_mask; + } +}; + +template +class VecMask { + public: + using size_type = int; + static constexpr size_type size() { + return VectorizedN::size(); + } + + private: + VectorizedN mask_; + + public: + VecMask() : mask_(static_cast(0)) {} + VecMask(const VectorizedN& mask) : mask_(mask) {} + + template = 0> + VecMask(const Vectorized& mask) : mask_(mask) {} + + template + static VecMask from(const VectorizedN& b_vec) { + __at_align__ U b_buf[size()]; + if constexpr (size() >= VectorizedN::size()) { + b_vec.store(b_buf); + for (int i = VectorizedN::size(); i < size(); i++) { + b_buf[i] = static_cast(0); + } + } else { + b_vec.store(b_buf, size()); + } + return from(b_buf); + } + + template + static VecMask from(U b) { + using int_t = int_same_size_t; + T mask = b ? c10::bit_cast((int_t)(~(int_t)0)) : (T)0; + return VectorizedN(mask); + } + + template + static VecMask from(U* b) { + using int_t = int_same_size_t; + __at_align__ T mask[size()]; +#pragma unroll + for (int i = 0; i < size(); i++) { + *(int_t*)(mask + i) = b[i] ? ~(int_t)0 : (int_t)0; + } + return VectorizedN(VectorizedN::loadu(mask)); + } + + static VecMask blendv( + const VecMask& c, + const VecMask& b, + const VecMask& a) { + VectorizedN result = VectorizedN::blendv( + VectorizedN(c), + VectorizedN(b), + VectorizedN(a)); + return result; + } + + void store(bool* b, int count = size()) { + constexpr int L = (VectorizedN::size() + Vectorized::size() - 1)/ Vectorized::size(); + auto res = this->to(); + res.store(b, count); + return; + } + + template = 2, int> = 0> + inline VectorizedN to() const { + return VecMaskTo::apply(*this); + } + + template = 0> + inline Vectorized to() const { + return VecMaskTo::apply(*this); + } + + template + inline VecMask cast() const { + return VecMaskCast::apply(*this); + } + + inline bool all_zero() const { + __at_align__ T mask[size()]; + mask_.store(mask); + return std::all_of( + mask, mask + size(), [](T m) { return m == static_cast(0); }); + } + + inline bool all_masked() const { + __at_align__ T mask[size()]; + mask_.store(mask); + return std::all_of( + mask, mask + size(), [](T m) { return m != static_cast(0); }); + } + + inline bool is_masked(int i) const { + __at_align__ T mask[size()]; + mask_.store(mask); + return mask[i] != static_cast(0); + } + + inline operator VectorizedN() const { + return mask_; + } + + template = 0> + inline operator Vectorized() const { + return mask_[0]; + } + + inline Vectorized operator[](int i) const { + return mask_[i]; + } + + template < + typename U, + int L, + std::enable_if_t= 2 && VectorizedN::size() >= size(), int> = 0> + VectorizedN loadu(const U* ptr) const { + return VecMaskLoad::apply(ptr, *this); + } + + template < + typename U, + int L, + std::enable_if_t::size() >= size(), int> = 0> + Vectorized loadu(const U* ptr) const { + return VecMaskLoad::apply(ptr, *this); + } +}; + 
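// Illustrative usage, not part of the patch: build a VecMask from a bool
// array and perform a masked load. This assumes an AVX-512 build where
// Vectorized<float> holds 16 lanes; on other ISAs the generic VecMaskLoad
// fallback above runs instead. The names masked_load/keep are made up for
// the example.
inline Vectorized<float> masked_load(const float* data, const bool* keep) {
  auto mask = VecMask<float, 1>::from(keep);  // all-ones lanes where keep[i] is true
  if (mask.all_zero()) {
    return Vectorized<float>(0.f);
  }
  return mask.loadu<float, 1>(data);          // masked-off lanes are zero-filled
}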
+#define VEC_MASK_DEFINE_UNARY_OP_GLOBAL(op) \ + template \ + inline VecMask op(const VecMask& a) { \ + return op(VectorizedN(a)); \ + } + +#define VEC_MASK_DEFINE_BINARY_OP_GLOBAL(op) \ + template < \ + typename T, \ + int N, \ + typename V, \ + int M, \ + std::enable_if_t::size() == VecMask::size(), int> = \ + 0> \ + inline VecMask op(const VecMask& a, const VecMask& b) { \ + return op( \ + VectorizedN(a), VectorizedN(b.template cast())); \ + } + +#define VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(op, EXPR) \ + template < \ + typename T, \ + int N, \ + typename V, \ + int M, \ + std::enable_if_t::size() == VecMask::size(), int> = \ + 0> \ + inline VecMask op(const VecMask& a, const VecMask& b) { \ + return EXPR; \ + } + +VEC_MASK_DEFINE_UNARY_OP_GLOBAL(operator~) +VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator&) +VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator|) +VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator^) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator>, a & ~b) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<, ~a& b) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator==, ~(a ^ b)) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator>=, (a == b) | (a > b)) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<=, (a == b) | (a < b)) + +#undef VEC_MASK_DEFINE_UNARY_OP_GLOBAL +#undef VEC_MASK_DEFINE_BINARY_OP_GLOBAL +#undef VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec_n.h b/aten/src/ATen/cpu/vec/vec_n.h new file mode 100644 index 0000000000000..5b0eb352d6627 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec_n.h @@ -0,0 +1,356 @@ +#pragma once + +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +/** + * @brief A class template representing a vectorized type with + * `N * Vectorized::size()` elements, aiming to support vectors of + * arbitrary size. A specific use case of it is to represent vectors + * converted from data types with different sizes but with the same + * number of vector elements, e.g., `VectorizedN` can be + * a vector converted from two `Vectorized`, `VectorizedN` + * can be a vector converted from two `Vectorized` etc. + * + * It supports most of the operations of `Vectorized` + * and the implementation delegates to `Vectorized` with loops over `N`. + * + * @tparam T The underlying type of the vectorized elements. + * @tparam N The number of underlying `Vectorized`. 
+ */ +template +class VectorizedN { + public: + using value_type = T; + using size_type = int; + + static constexpr size_type size_T = sizeof(T); + static constexpr size_type size() { + return Vectorized::size() * N; + } + + private: + std::array, N> values; + + public: + // methods not implemented yet: + // variadic constructor, operator T*, as_bytes, zero_mask + +#define VECTORIZEDN_DEFINE_UNARY_OP(op) \ + VectorizedN op() const { \ + return unary_op([](const Vectorized& a) { return a.op(); }); \ + } + +#define VECTORIZEDN_DEFINE_BINARY_OP(op) \ + VectorizedN op(const VectorizedN& other) const { \ + return binary_op( \ + other, [](const Vectorized& a, const Vectorized& b) { \ + return a.op(b); \ + }); \ + } + + template + inline VectorizedN unary_op(Op op) const { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result.values[i] = op(values[i]); + } + return result; + } + + template + inline VectorizedN binary_op(const VectorizedN& other, Op op) + const { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result.values[i] = op(values[i], other.values[i]); + } + return result; + } + + VectorizedN() = default; + + explicit VectorizedN(T val) { + for (int i = 0; i < N; ++i) { + values[i] = Vectorized(val); + } + } + + template = 0> + VectorizedN(const Vectorized& val) : values({val}) {} + + template = 0> + inline operator Vectorized() const { + return values[0]; + } + + inline const Vectorized& operator[](int i) const { + return values[i]; + } + + inline Vectorized& operator[](int i) { + return values[i]; + } + + template + static VectorizedN blend( + const VectorizedN& a, + const VectorizedN& b) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = Vectorized::template blend(a.values[i], b.values[i]); + } + return result; + } + + static VectorizedN blendv( + const VectorizedN& a, + const VectorizedN& b, + const VectorizedN& mask) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = + Vectorized::blendv(a.values[i], b.values[i], mask.values[i]); + } + return result; + } + + template + static VectorizedN arange( + T base = static_cast(0), + step_t step = static_cast(1)) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = Vectorized::arange(base, step); + base += step * Vectorized::size(); + } + return result; + } + + static VectorizedN set( + const VectorizedN& a, + const VectorizedN& b, + int64_t count = size()) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = Vectorized::set( + a.values[i], + b.values[i], + std::min(count, (int64_t)Vectorized::size())); + count -= Vectorized::size(); + if (count <= 0) { + break; + } + } + return result; + } + + static VectorizedN loadu(const void* ptr) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = Vectorized::loadu(ptr); + ptr = static_cast(ptr) + Vectorized::size(); + } + return result; + } + + static VectorizedN loadu(const void* ptr, int64_t count) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = Vectorized::loadu( + ptr, std::min(count, (int64_t)Vectorized::size())); + ptr = static_cast(ptr) + Vectorized::size(); + count -= Vectorized::size(); + if (count <= 0) { + break; + } + } + return result; + } + + void store(void* ptr) const { + for (int i = 0; i < N; ++i) { + values[i].store(ptr); + ptr = static_cast(ptr) + Vectorized::size(); + } + } + + void store(void* ptr, int count) const { + for (int i = 
0; i < N; ++i) { + values[i].store(ptr, std::min(count, (int)Vectorized::size())); + ptr = static_cast(ptr) + Vectorized::size(); + count -= Vectorized::size(); + if (count <= 0) { + break; + } + } + } + + bool has_inf_nan() const { + for (int i = 0; i < N; ++i) { + if (values[i].has_inf_nan()) { + return true; + } + } + return false; + } + + VectorizedN map(T (*const f)(T)) const { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = values[i].map(f); + } + return result; + } + + VectorizedN map(T (*const f)(const T&)) const { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = values[i].map(f); + } + return result; + } + + VECTORIZEDN_DEFINE_UNARY_OP(abs) + VECTORIZEDN_DEFINE_UNARY_OP(sgn) + VECTORIZEDN_DEFINE_UNARY_OP(angle) + VECTORIZEDN_DEFINE_UNARY_OP(real) + VECTORIZEDN_DEFINE_UNARY_OP(imag) + VECTORIZEDN_DEFINE_UNARY_OP(conj) + VECTORIZEDN_DEFINE_UNARY_OP(acos) + VECTORIZEDN_DEFINE_UNARY_OP(acosh) + VECTORIZEDN_DEFINE_UNARY_OP(asin) + VECTORIZEDN_DEFINE_UNARY_OP(atan) + VECTORIZEDN_DEFINE_UNARY_OP(atanh) + VECTORIZEDN_DEFINE_BINARY_OP(atan2) + VECTORIZEDN_DEFINE_BINARY_OP(copysign) + VECTORIZEDN_DEFINE_UNARY_OP(erf) + VECTORIZEDN_DEFINE_UNARY_OP(erfc) + VECTORIZEDN_DEFINE_UNARY_OP(erfinv) + VECTORIZEDN_DEFINE_UNARY_OP(exp) + VECTORIZEDN_DEFINE_UNARY_OP(exp2) + VECTORIZEDN_DEFINE_UNARY_OP(expm1) + VECTORIZEDN_DEFINE_UNARY_OP(exp_u20) + VECTORIZEDN_DEFINE_UNARY_OP(frac) + VECTORIZEDN_DEFINE_BINARY_OP(fmod) + VECTORIZEDN_DEFINE_UNARY_OP(log) + VECTORIZEDN_DEFINE_UNARY_OP(log10) + VECTORIZEDN_DEFINE_UNARY_OP(log1p) + VECTORIZEDN_DEFINE_UNARY_OP(log2) + VECTORIZEDN_DEFINE_UNARY_OP(ceil) + VECTORIZEDN_DEFINE_UNARY_OP(cos) + VECTORIZEDN_DEFINE_UNARY_OP(cosh) + VECTORIZEDN_DEFINE_UNARY_OP(floor) + VECTORIZEDN_DEFINE_BINARY_OP(hypot) + VECTORIZEDN_DEFINE_UNARY_OP(i0) + VECTORIZEDN_DEFINE_UNARY_OP(i0e) + VECTORIZEDN_DEFINE_UNARY_OP(digamma) + VECTORIZEDN_DEFINE_BINARY_OP(igamma) + VECTORIZEDN_DEFINE_BINARY_OP(igammac) + VECTORIZEDN_DEFINE_UNARY_OP(neg) + VECTORIZEDN_DEFINE_BINARY_OP(nextafter) + VECTORIZEDN_DEFINE_UNARY_OP(round) + VECTORIZEDN_DEFINE_UNARY_OP(sin) + VECTORIZEDN_DEFINE_UNARY_OP(sinh) + VECTORIZEDN_DEFINE_UNARY_OP(tan) + VECTORIZEDN_DEFINE_UNARY_OP(tanh) + VECTORIZEDN_DEFINE_UNARY_OP(trunc) + VECTORIZEDN_DEFINE_UNARY_OP(lgamma) + VECTORIZEDN_DEFINE_UNARY_OP(sqrt) + VECTORIZEDN_DEFINE_UNARY_OP(reciprocal) + VECTORIZEDN_DEFINE_UNARY_OP(rsqrt) + VECTORIZEDN_DEFINE_BINARY_OP(pow) + VECTORIZEDN_DEFINE_BINARY_OP(operator==) + VECTORIZEDN_DEFINE_BINARY_OP(operator!=) + VECTORIZEDN_DEFINE_BINARY_OP(operator>=) + VECTORIZEDN_DEFINE_BINARY_OP(operator<=) + VECTORIZEDN_DEFINE_BINARY_OP(operator>) + VECTORIZEDN_DEFINE_BINARY_OP(operator<) + VECTORIZEDN_DEFINE_BINARY_OP(eq) + VECTORIZEDN_DEFINE_BINARY_OP(ne) + VECTORIZEDN_DEFINE_BINARY_OP(gt) + VECTORIZEDN_DEFINE_BINARY_OP(ge) + VECTORIZEDN_DEFINE_BINARY_OP(lt) + VECTORIZEDN_DEFINE_BINARY_OP(le) + +#undef VECTORIZEDN_DEFINE_UNARY_OP +#undef VECTORIZEDN_DEFINE_BINARY_OP +}; + +#define VECTORIZEDN_DEFINE_UNARY_OP_GLOBAL(op) \ + template \ + inline VectorizedN op(const VectorizedN& a) { \ + return a.unary_op([](const Vectorized& a) { return op(a); }); \ + } + +#define VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(op) \ + template \ + inline VectorizedN op( \ + const VectorizedN& a, const VectorizedN& b) { \ + return a.binary_op(b, [](const Vectorized& a, const Vectorized& b) { \ + return op(a, b); \ + }); \ + } + +#define VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(op) \ + template \ + inline VectorizedN& 
op( \ + VectorizedN& a, const VectorizedN& b) { \ + a = a.binary_op(b, [](const Vectorized& a, const Vectorized& b) { \ + return op(a, b); \ + }); \ + return a; \ + } + +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator+) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator-) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator*) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator/) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator%) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator||) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator<<) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator>>) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(maximum) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(minimum) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(fmadd) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(fmsub) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp_max) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp_min) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator&) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator|) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator^) +VECTORIZEDN_DEFINE_UNARY_OP_GLOBAL(operator~) + +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator+=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator-=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator*=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator/=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator%=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator<<=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator>>=) + +#undef VECTORIZEDN_DEFINE_UNARY_OP_GLOBAL +#undef VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL +#undef VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL + +template +inline T vec_reduce_all(const OpVec& vec_fun, VectorizedN acc_vec) { + Vectorized vec_result = acc_vec[0]; + for (int i = 1; i < N; i++) { + vec_result = vec_fun(vec_result, acc_vec[i]); + } + return vec_reduce_all(vec_fun, vec_result); +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vml.h b/aten/src/ATen/cpu/vml.h index ba99583b02b23..fe57a27a04d9f 100644 --- a/aten/src/ATen/cpu/vml.h +++ b/aten/src/ATen/cpu/vml.h @@ -100,11 +100,11 @@ IMPLEMENT_VML(lgamma) #if AT_MKL_ENABLED() && !defined(__APPLE__) // NB: LP64 MKL is the most commonly used and thus we assume it here. That means -// we need to expect MKL_INT to be of type int, which implies int32_t in most +// we need to expect MKL_INT to be of type int, which implies int32_t or int64_t in most // cases. 
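[Illustrative aside, not part of the patch] Stepping back to the VectorizedN class completed above (the diff rendering drops its template arguments; from the member `std::array<Vectorized, N>` and `Vectorized::size() * N` it is evidently VectorizedN<T, N> wrapping N Vectorized<T> registers), here is a minimal usage sketch. The include path, the <float, 2> instantiation, and the choice of exp() are assumptions for illustration only.

    #include <cstdint>
    #include <ATen/cpu/vec/vec.h>   // assumed umbrella header for at::vec

    // Apply exp() in place over a float buffer, two Vectorized<float> registers at a time.
    void exp_inplace(float* data, int64_t n) {
      using VecN = at::vec::VectorizedN<float, 2>;
      const int64_t step = VecN::size();            // Vectorized<float>::size() * 2
      int64_t i = 0;
      for (; i + step <= n; i += step) {
        VecN v = VecN::loadu(data + i);             // fills all N inner vectors
        v.exp().store(data + i);                    // unary ops fan out across the N lanes
      }
      if (i < n) {
        VecN v = VecN::loadu(data + i, n - i);      // masked tail load
        v.exp().store(data + i, static_cast<int>(n - i));
      }
    }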
static_assert( - std::is_same::value, - "MKL_INT is assumed to be int32_t"); + std::is_same_v || std::is_same_v, + "MKL_INT is assumed to be int32_t or int64_t"); #define IMPLEMENT_VML_MKL_STUB(op, mklop, type, mkltype) \ template <> \ inline void v##op(type * out, const type * in, int64_t size) { \ diff --git a/aten/src/ATen/cuda/ApplyGridUtils.cuh b/aten/src/ATen/cuda/ApplyGridUtils.cuh index 18ce3ba34e87c..b0b1412298d7b 100644 --- a/aten/src/ATen/cuda/ApplyGridUtils.cuh +++ b/aten/src/ATen/cuda/ApplyGridUtils.cuh @@ -20,7 +20,7 @@ constexpr uint32_t AT_APPLY_THREADS_PER_BLOCK = 512; constexpr uint32_t AT_APPLY_BLOCKS_PER_SM = 4; template -inline bool getApplyGrid(uint64_t totalElements, dim3& grid, int64_t curDevice, int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) { +inline bool getApplyGrid(uint64_t totalElements, dim3& grid, c10::DeviceIndex curDevice, int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) { if (curDevice == -1) return false; uint64_t numel_per_thread = static_cast(max_threads_per_block) * static_cast(step); uint64_t numBlocks = ATenCeilDiv(totalElements, numel_per_thread); diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index cd857c00988ba..4dcdabf17b3b9 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -403,7 +403,7 @@ inline bool CUDA_tensor_apply2(at::TensorBase a, const dim3 block = getApplyBlock(max_threads_per_block); dim3 grid; - int64_t curDevice = current_device(); + auto curDevice = current_device(); if (curDevice == -1) return false; if (!getApplyGrid(totalElements, grid, curDevice, max_threads_per_block)) { return false; diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 9ae84c418d255..bfe6a02741ede 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -6,12 +6,17 @@ #include #include #include +#include +#include #include #include #include #include #ifdef USE_ROCM +#if ROCM_VERSION >= 60000 +#include +#endif // until hipblas has an API to accept flags, we must use rocblas here #include #include @@ -190,10 +195,10 @@ static size_t _parseChosenWorkspaceSize() { workspace_size = std::stoi(val); } catch(std::invalid_argument const& e) { TORCH_WARN("invalid CUBLASLT_WORKSPACE_SIZE,", - " using default workspace size of ", workspace_size, " bytes."); + " using default workspace size of ", workspace_size, " KiB."); } catch(std::out_of_range const& e) { TORCH_WARN("CUBLASLT_WORKSPACE_SIZE out of range,", - " using default workspace size of ", workspace_size, " bytes."); + " using default workspace size of ", workspace_size, " KiB."); } } return workspace_size * 1024; @@ -231,8 +236,289 @@ namespace at::cuda::blas { CUDABLAS_NONNEGINT_CHECK(bgemm, num_batches); \ } while (0) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) + +#if defined(USE_ROCM) && ROCM_VERSION >= 50700 && ROCM_VERSION < 60000 +// only for rocm 5.7 where we first supported hipblaslt, it was difficult +// to hipify correctly without this change. 
+#define hipDataType hipblasDatatype_t +#endif + +// hipblaslt custom types were a temporary work-around +#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && defined(HIPBLASLT_CUSTOM_DATA_TYPE) +hipblasltDatatype_t hipToLt(hipDataType type) { + switch (type) { + case HIP_R_32F: return HIPBLASLT_R_32F; + case HIP_R_64F: return HIPBLASLT_R_64F; + case HIP_R_16F: return HIPBLASLT_R_16F; + case HIP_R_8I: return HIPBLASLT_R_8I; + case HIP_C_32F: return HIPBLASLT_C_32F; + case HIP_C_64F: return HIPBLASLT_C_64F; + case HIP_C_16F: return HIPBLASLT_C_16F; + case HIP_C_8I: return HIPBLASLT_C_8I; + case HIP_R_8U: return HIPBLASLT_R_8U; + case HIP_C_8U: return HIPBLASLT_C_8U; + case HIP_R_32I: return HIPBLASLT_R_32I; + case HIP_C_32I: return HIPBLASLT_C_32I; + case HIP_R_32U: return HIPBLASLT_R_32U; + case HIP_C_32U: return HIPBLASLT_C_32U; + case HIP_R_16BF: return HIPBLASLT_R_16B; + case HIP_C_16BF: return HIPBLASLT_C_16B; + default: TORCH_CHECK(false, "unknown hipDataType"); + } +} +#define HIPTOLT(type) hipToLt(type) +#else +#define HIPTOLT(type) type +#endif + +#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && defined(HIPBLASLT_CUSTOM_COMPUTE_TYPE) +hipblasLtComputeType_t hipblasToLt(hipblasComputeType_t type) { + switch (type) { + case HIPBLAS_COMPUTE_32F: return HIPBLASLT_COMPUTE_F32; + case HIPBLAS_COMPUTE_32F_FAST_16F: return HIPBLASLT_COMPUTE_F32_FAST_F16; + case HIPBLAS_COMPUTE_32F_FAST_TF32: return HIPBLASLT_COMPUTE_F32_FAST_XF32; + case HIPBLAS_COMPUTE_64F: return HIPBLASLT_COMPUTE_F64; + case HIPBLAS_COMPUTE_32I: return HIPBLASLT_COMPUTE_I32; + default: TORCH_CHECK(false, "unknown hipblasComputeType_t"); + } +} +#define HIPCOMPTOLT(type) hipblasToLt(type) +#else +#define HIPCOMPTOLT(type) type +#endif + +namespace { +// Following the pattern of CuSparseDescriptor +// Defined here for now because this is the only place cublas_lt interface is +// used but can be moved to a header once cublas_lt interface is used in +// multiple places. +template +struct CuBlasLtDeleter { + void operator()(T* x) { + if (x != nullptr) { + TORCH_CUDABLAS_CHECK(destructor(x)); + } + } +}; + +template +class CuBlasLtDescriptor { + public: + T* descriptor() const { + return descriptor_.get(); + } + T* descriptor() { + return descriptor_.get(); + } + + protected: + std::unique_ptr> descriptor_; +}; + +class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor< + cublasLtMatmulDescOpaque_t, + &cublasLtMatmulDescDestroy> { + public: + CuBlasLtMatmulDescriptor( + cublasComputeType_t compute_type, + cudaDataType_t scale_type) { + cublasLtMatmulDesc_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK( + cublasLtMatmulDescCreate(&raw_descriptor, HIPCOMPTOLT(compute_type), HIPTOLT(scale_type))); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { + TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T))); + } +}; + +class CuBlasLtMatrixLayout : public CuBlasLtDescriptor< + cublasLtMatrixLayoutOpaque_t, + &cublasLtMatrixLayoutDestroy> { + public: + CuBlasLtMatrixLayout( + cudaDataType_t type, + uint64_t rows, + uint64_t cols, + int64_t ld, + bool t = false) { + cublasLtMatrixLayout_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK( + cublasLtMatrixLayoutCreate(&raw_descriptor, HIPTOLT(type), t ? cols : rows, t ? 
rows : cols, ld)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) { + TORCH_CUDABLAS_CHECK(::cublasLtMatrixLayoutSetAttribute(descriptor(), attr, &value, sizeof(T))); + } +}; + +class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< + cublasLtMatmulPreferenceOpaque_t, + &cublasLtMatmulPreferenceDestroy> { + public: + CuBlasLtMatmulPreference() { + cublasLtMatmulPreference_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceCreate(&raw_descriptor)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) { + TORCH_CUDABLAS_CHECK(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T))); + } +}; +} // namespace + +#endif + +template +inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60000) + cudaDataType_t abcType = CUDA_R_32F; + cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; + cudaDataType_t scaleType = CUDA_R_32F; + if constexpr (std::is_same_v) { + abcType = CUDA_R_64F; + computeType = CUBLAS_COMPUTE_64F; + scaleType = CUDA_R_64F; + } else if constexpr (std::is_same_v) { +#ifndef USE_ROCM + if (at::globalContext().allowTF32CuBLAS()) { + computeType = CUBLAS_COMPUTE_32F_FAST_TF32; + } +#endif + } else if constexpr (std::is_same_v>) { + abcType = CUDA_C_64F; + computeType = CUBLAS_COMPUTE_64F; + scaleType = CUDA_C_64F; + } else if constexpr (std::is_same_v>) { + abcType = CUDA_C_32F; + scaleType = CUDA_C_32F; + } else if constexpr (std::is_same_v) { + abcType = CUDA_R_16F; + } else if constexpr (std::is_same_v) { + abcType = CUDA_R_16BF; + } else { + AT_ERROR("at::cuda::blas::bgemm_internal_cublaslt: not implemented for ", typeid(Dtype).name()); + } + + globalContext().alertCuBLASConfigNotDeterministic(); + cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); + cublasOperation_t opa = _cublasOpFromChar(transa); + cublasOperation_t opb = _cublasOpFromChar(transb); + _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); + + CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, opa); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, opb); + CuBlasLtMatrixLayout Adesc(abcType, m, k, lda, opa == CUBLAS_OP_T); + CuBlasLtMatrixLayout Bdesc(abcType, k, n, ldb, opb == CUBLAS_OP_T); + CuBlasLtMatrixLayout Cdesc(abcType, m, n, ldc); + + if (num_batches > 1) { + int num_batches_as_int = static_cast(num_batches); + Adesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, num_batches_as_int); + Bdesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, num_batches_as_int); + Cdesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, num_batches_as_int); + Adesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, stridea); + Bdesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, strideb); + Cdesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, stridec); + } + + CuBlasLtMatmulPreference preference; + // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind + // setting this to 1M. 
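// [Illustrative aside, not part of the patch] _getWorkspaceSize() reflects the
// CUBLASLT_WORKSPACE_SIZE environment variable, which _parseChosenWorkspaceSize()
// (earlier in this file) reads in KiB and multiplies by 1024; e.g.
// CUBLASLT_WORKSPACE_SIZE=4096 requests a 4096 KiB (4 MiB) Lt workspace, while an
// unset or invalid value falls back to the default discussed in the issue above.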
+ size_t workspaceSize = _getWorkspaceSize(); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspaceSize); + +#ifndef USE_ROCM + uint32_t a_alignment = _getAlignment(reinterpret_cast(a)); + uint32_t b_alignment = _getAlignment(reinterpret_cast(b)); + uint32_t c_alignment = _getAlignment(reinterpret_cast(c)); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES, a_alignment); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES, b_alignment); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, c_alignment); +#endif + + auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); + auto workspace = allocator.allocate(workspaceSize); + TORCH_CHECK(workspace.get() != nullptr, "OOM trying to allocate workspace for cublaslt"); + + cublasLtMatmulHeuristicResult_t heuristicResult = {}; + int returnedResult = 0; + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + computeDesc.descriptor(), + Adesc.descriptor(), + Bdesc.descriptor(), + Cdesc.descriptor(), + Cdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + if (returnedResult == 0) { + TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); + } + + cublasStatus_t cublasStatus = cublasLtMatmul( + ltHandle, + computeDesc.descriptor(), + &alpha, + a, + Adesc.descriptor(), + b, + Bdesc.descriptor(), + &beta, + c, + Cdesc.descriptor(), + c, + Cdesc.descriptor(), + &heuristicResult.algo, + workspace.mutable_get(), + workspaceSize, + at::cuda::getCurrentCUDAStream()); + TORCH_CHECK( + cublasStatus == CUBLAS_STATUS_SUCCESS, + "CUDA error: ", + at::cuda::blas::_cublasGetErrorEnum(cublasStatus), + " when calling cublasLtMatmul with transpose_mat1 ", + (opa == CUBLAS_OP_T), + " transpose_mat2 ", + (opb == CUBLAS_OP_T), + " m ", + m, + " n ", + n, + " k ", + k, + " lda ", + lda, + " ldb ", + ldb, + " ldc ", + ldc, + " abcType ", + abcType, + " computeType ", + computeType, + " scaleType ", + scaleType); +#else + AT_ERROR("at::cuda::blas::bgemm_internal_cublaslt: not implemented for ", typeid(Dtype).name()); +#endif +} + + +template +inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + AT_ERROR("at::cuda::blas::bgemm_internal_cublas: not implemented for ", typeid(Dtype).name()); +} + template <> -void bgemm(CUDABLAS_BGEMM_ARGTYPES(double)) { +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(double)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -245,7 +531,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(double)) { } template <> -void bgemm(CUDABLAS_BGEMM_ARGTYPES(float)) { +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(float)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -258,7 +544,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(float)) { } template <> -void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { +void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -273,7 +559,7 @@ void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) } template <> -void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { +void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic 
Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -288,7 +574,7 @@ void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { } template <> -void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -335,7 +621,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { } template <> -void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); BGEMM_CHECK_ARGVALUES(at::BFloat16); @@ -346,23 +632,226 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { const float fbeta = beta; _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); -#if defined(USE_ROCM) && ROCM_VERSION >= 60000 - auto compute_type = CUBLAS_COMPUTE_32F; -#else - auto compute_type = CUDA_R_32F; -#endif - TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedEx(handle, - opa, opb, (int)m, (int)n, (int)k, - (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, - b, CUDA_R_16BF, (int)ldb, strideb, - (void*)&fbeta, c, CUDA_R_16BF, (int)ldc, stridec, - (int)num_batches, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +#if defined(USE_ROCM) && ROCM_VERSION >= 60000 + auto compute_type = CUBLAS_COMPUTE_32F; +#else + auto compute_type = CUDA_R_32F; +#endif + TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedEx(handle, + opa, opb, (int)m, (int)n, (int)k, + (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, + b, CUDA_R_16BF, (int)ldb, strideb, + (void*)&fbeta, c, CUDA_R_16BF, (int)ldc, stridec, + (int)num_batches, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} + +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +#ifdef USE_ROCM + // hipblaslt does not support double gemm yet + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); +#else + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(double)); +#endif + } + else { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); + } +} + +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); + } + else { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); + } +} + +template <> +void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +#ifdef USE_ROCM + // hipblaslt does not support complex gemm yet + bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); +#else + bgemm_internal_cublaslt>(CUDABLAS_BGEMM_ARGS(c10::complex)); +#endif + } + else { + bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } +} + +template <> +void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +#ifdef USE_ROCM + // hipblaslt does not support complex gemm yet + bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); +#else + bgemm_internal_cublaslt>(CUDABLAS_BGEMM_ARGS(c10::complex)); +#endif + } + else { + bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } +} + +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) +{ + if 
(at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); + } + else { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); + } +} + +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + else { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +} + +template +inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES(DType)) { + tunable::GemmStridedBatchedParams params; + params.transa = transa; + params.transb = transb; + params.m = m; + params.n = n; + params.k = k; + params.alpha = alpha; + params.a = a; + params.lda = lda; + params.stride_a = stridea; + params.b = b; + params.ldb = ldb; + params.stride_b = strideb; + params.beta = beta; + params.c = c; + params.ldc = ldc; + params.stride_c = stridec; + params.batch = num_batches; + + bool transa_ = ((transa != 'n') && (transa != 'N')); + bool transb_ = ((transb != 'n') && (transb != 'N')); + + if (transa_ && transb_) { + static tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else if (transa_ && !transb_) { + static tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else if (!transa_ && transb_) { + static tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else if (!transa_ && !transb_) { + static tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else { + TORCH_CHECK(false, "unreachable"); + } +} + +template <> +void bgemm(CUDABLAS_BGEMM_ARGTYPES(double)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + bgemm_tunable(CUDABLAS_BGEMM_ARGS(double)); + } + else { + bgemm_internal(CUDABLAS_BGEMM_ARGS(double)); + } +} + +template <> +void bgemm(CUDABLAS_BGEMM_ARGTYPES(float)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + bgemm_tunable(CUDABLAS_BGEMM_ARGS(float)); + } + else { + bgemm_internal(CUDABLAS_BGEMM_ARGS(float)); + } +} + +template <> +void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + bgemm_tunable>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } + else { + bgemm_internal>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } +} + +template <> +void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + bgemm_tunable>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } + else { + bgemm_internal>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } +} + +template <> +void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + bgemm_tunable(CUDABLAS_BGEMM_ARGS(at::Half)); + } + else { + bgemm_internal(CUDABLAS_BGEMM_ARGS(at::Half)); + } +} + +template <> +void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + bgemm_tunable(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + else { + bgemm_internal(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +} + +template +inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + // forward to bgemm implementation but set strides and batches to 0 + bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); +} 
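[Illustrative aside, not part of the patch] The call layering this patch sets up for gemm/bgemm, summarized schematically (the bgemm pieces appear above, the gemm counterparts below; template arguments are reconstructed here since the diff rendering drops them):

    // at::cuda::blas::gemm<float>(args)
    //   -> tunable::getTuningContext()->IsTunableOpEnabled()
    //        ? gemm_tunable<float>(args)            // TunableOp path (GemmTunableOp)
    //        : gemm_internal<float>(args)
    // gemm_internal<float>(args)
    //   -> at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt
    //        ? gemm_internal_cublaslt<float>(args)  // forwards to bgemm_internal_cublaslt
    //                                               // with strides and num_batches set to 0
    //        : gemm_internal_cublas<float>(args)    // plain cuBLAS path
    // bgemm<float> follows the same shape via bgemm_tunable / bgemm_internal /
    // bgemm_internal_cublaslt / bgemm_internal_cublas.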
+ +template +inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + AT_ERROR("at::cuda::blas::gemm_internal_cublas: not implemented for ", typeid(Dtype).name()); } template <> -void gemm(CUDABLAS_GEMM_ARGTYPES(double)) { +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(double)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -375,7 +864,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(double)) { } template <> -void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(float)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -388,7 +877,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { } template <> -void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { +void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -403,7 +892,7 @@ void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { } template <> -void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { +void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -418,7 +907,7 @@ void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { } template <> -void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::Half)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -514,7 +1003,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { } template <> -void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); @@ -558,136 +1047,195 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); } -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) - -#if defined(USE_ROCM) && ROCM_VERSION >= 50700 && ROCM_VERSION < 60000 -// only for rocm 5.7 where we first supported hipblaslt, it was difficult -// to hipify correctly without this change. 
-#define hipDataType hipblasDatatype_t +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +#ifdef USE_ROCM + // hipblaslt does not support double gemm yet + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(double)); +#else + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); #endif + } + else { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(double)); + } +} -// hipblaslt custom types were a temporary work-around -#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && HIPBLASLT_CUSTOM_DATA_TYPE -hipblasltDatatype_t hipToLt(hipDataType type) { - switch (type) { - case HIP_R_32F: return HIPBLASLT_R_32F; - case HIP_R_64F: return HIPBLASLT_R_64F; - case HIP_R_16F: return HIPBLASLT_R_16F; - case HIP_R_8I: return HIPBLASLT_R_8I; - case HIP_C_32F: return HIPBLASLT_C_32F; - case HIP_C_64F: return HIPBLASLT_C_64F; - case HIP_C_16F: return HIPBLASLT_C_16F; - case HIP_C_8I: return HIPBLASLT_C_8I; - case HIP_R_8U: return HIPBLASLT_R_8U; - case HIP_C_8U: return HIPBLASLT_C_8U; - case HIP_R_32I: return HIPBLASLT_R_32I; - case HIP_C_32I: return HIPBLASLT_C_32I; - case HIP_R_32U: return HIPBLASLT_R_32U; - case HIP_C_32U: return HIPBLASLT_C_32U; - case HIP_R_16BF: return HIPBLASLT_R_16B; - case HIP_C_16BF: return HIPBLASLT_C_16B; - default: TORCH_CHECK(false); - } +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } + else { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); + } } -#define HIPTOLT(type) hipToLt(type) + +template <> +void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +#ifdef USE_ROCM + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); #else -#define HIPTOLT(type) type + gemm_internal_cublaslt>(CUDABLAS_GEMM_ARGS(c10::complex)); #endif - -#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && HIPBLASLT_CUSTOM_COMPUTE_TYPE -hipblasLtComputeType_t hipblasToLt(hipblasComputeType_t type) { - switch (type) { - case HIPBLAS_COMPUTE_32F: return HIPBLASLT_COMPUTE_F32; - case HIPBLAS_COMPUTE_32F_FAST_16F: return HIPBLASLT_COMPUTE_F32_FAST_F16; - case HIPBLAS_COMPUTE_32F_FAST_TF32: return HIPBLASLT_COMPUTE_F32_FAST_XF32; - case HIPBLAS_COMPUTE_64F: return HIPBLASLT_COMPUTE_F64; - case HIPBLAS_COMPUTE_32I: return HIPBLASLT_COMPUTE_I32; - default: TORCH_CHECK(false); - } + } + else { + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + } } -#define HIPCOMPTOLT(type) hipblasToLt(type) + +template <> +void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +#ifdef USE_ROCM + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); #else -#define HIPCOMPTOLT(type) type + gemm_internal_cublaslt>(CUDABLAS_GEMM_ARGS(c10::complex)); #endif + } + else { + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + } +} -namespace { -// Following the pattern of CuSparseDescriptor -// Defined here for now because this is the only place cublas_lt interface is -// used but can be moved to a header once cublas_lt interface is used in -// multiple places. 
-template -struct CuBlasLtDeleter { - void operator()(T* x) { - if (x != nullptr) { - TORCH_CUDABLAS_CHECK(destructor(x)); - } +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); } -}; + else { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); + } +} -template -class CuBlasLtDescriptor { - public: - T* descriptor() const { - return descriptor_.get(); +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); } - T* descriptor() { - return descriptor_.get(); + else { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); } +} - protected: - std::unique_ptr> descriptor_; -}; +template +inline void gemm_tunable(CUDABLAS_GEMM_ARGTYPES(DType)) { + tunable::GemmParams params; + params.transa = transa; + params.transb = transb; + params.m = m; + params.n = n; + params.k = k; + params.alpha = alpha; + params.a = a; + params.lda = lda; + params.b = b; + params.ldb = ldb; + params.beta = beta; + params.c = c; + params.ldc = ldc; -class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor< - cublasLtMatmulDescOpaque_t, - &cublasLtMatmulDescDestroy> { - public: - CuBlasLtMatmulDescriptor( - cublasComputeType_t compute_type, - cudaDataType_t scale_type) { - cublasLtMatmulDesc_t raw_descriptor = nullptr; - TORCH_CUDABLAS_CHECK( - cublasLtMatmulDescCreate(&raw_descriptor, HIPCOMPTOLT(compute_type), HIPTOLT(scale_type))); - descriptor_.reset(raw_descriptor); + bool transa_ = ((transa != 'n') && (transa != 'N')); + bool transb_ = ((transb != 'n') && (transb != 'N')); + + if (transa_ && transb_) { + static tunable::GemmTunableOp gemm{}; + gemm(¶ms); } - template - inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { - TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T))); + else if (transa_ && !transb_) { + static tunable::GemmTunableOp gemm{}; + gemm(¶ms); } -}; + else if (!transa_ && transb_) { + static tunable::GemmTunableOp gemm{}; + gemm(¶ms); + } + else if (!transa_ && !transb_) { + static tunable::GemmTunableOp gemm{}; + gemm(¶ms); + } + else { + TORCH_CHECK(false, "unreachable"); + } +} -class CuBlasLtMatrixLayout : public CuBlasLtDescriptor< - cublasLtMatrixLayoutOpaque_t, - &cublasLtMatrixLayoutDestroy> { - public: - CuBlasLtMatrixLayout( - cudaDataType_t type, - uint64_t rows, - uint64_t cols, - int64_t ld, - bool t = false) { - cublasLtMatrixLayout_t raw_descriptor = nullptr; - TORCH_CUDABLAS_CHECK( - cublasLtMatrixLayoutCreate(&raw_descriptor, HIPTOLT(type), t ? cols : rows, t ? 
rows : cols, ld)); - descriptor_.reset(raw_descriptor); +template <> +void gemm(CUDABLAS_GEMM_ARGTYPES(double)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + gemm_tunable(CUDABLAS_GEMM_ARGS(double)); } -}; + else { + gemm_internal(CUDABLAS_GEMM_ARGS(double)); + } +} -class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< - cublasLtMatmulPreferenceOpaque_t, - &cublasLtMatmulPreferenceDestroy> { - public: - CuBlasLtMatmulPreference() { - cublasLtMatmulPreference_t raw_descriptor = nullptr; - TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceCreate(&raw_descriptor)); - descriptor_.reset(raw_descriptor); +template <> +void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + gemm_tunable(CUDABLAS_GEMM_ARGS(float)); } - template - inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) { - TORCH_CUDABLAS_CHECK(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T))); + else { + gemm_internal(CUDABLAS_GEMM_ARGS(float)); } -}; -} // namespace +} + +template <> +void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + gemm_tunable>(CUDABLAS_GEMM_ARGS(c10::complex)); + } + else { + gemm_internal>(CUDABLAS_GEMM_ARGS(c10::complex)); + } +} + +template <> +void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + gemm_tunable>(CUDABLAS_GEMM_ARGS(c10::complex)); + } + else { + gemm_internal>(CUDABLAS_GEMM_ARGS(c10::complex)); + } +} + +template <> +void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + gemm_tunable(CUDABLAS_GEMM_ARGS(at::Half)); + } + else { + gemm_internal(CUDABLAS_GEMM_ARGS(at::Half)); + } +} + +template <> +void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + gemm_tunable(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } + else { + gemm_internal(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +} + +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) template void gemm_and_bias( @@ -745,8 +1293,11 @@ void gemm_and_bias( epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; #endif } - computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue); - computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias); + + if (bias != nullptr) { + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias); + } CuBlasLtMatrixLayout Adesc(abcType, m, k, mat1_ld, transpose_mat1); CuBlasLtMatrixLayout Bdesc(abcType, k, n, mat2_ld, transpose_mat2); @@ -771,6 +1322,7 @@ void gemm_and_bias( auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); auto workspace = allocator.allocate(workspaceSize); + TORCH_CHECK(workspace.get() != nullptr, "OOM trying to allocate workspace for cublaslt"); cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; @@ -921,21 +1473,32 @@ void scaled_gemm( ScalarType result_dtype, void* amax_ptr, bool use_fast_accum) { - #if CUDA_VERSION >= 11080 +#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && ROCM_VERSION >= 60000) const auto computeType = CUBLAS_COMPUTE_32F; const auto scaleType = CUDA_R_32F; const 
int8_t fastAccuMode = use_fast_accum ? 1 : 0; + const float alpha_val = 1.0; + const float beta_val = 0.0; CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, _cublasOpFromChar(transa)); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb)); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr); +#ifndef USE_ROCM +if (isFloat8Type(result_dtype)) { computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, amax_ptr); +} computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_FAST_ACCUM, fastAccuMode); +#endif CuBlasLtMatrixLayout Adesc(ScalarTypeToCudaDataType(mat1_dtype), m, k, mat1_ld, transa == 't'); CuBlasLtMatrixLayout Bdesc(ScalarTypeToCudaDataType(mat2_dtype), k, n, mat2_ld, transb == 't'); +#ifdef USE_ROCM + // Cdesc is unused, beta is 0. But hipblaslt needs this set to something reasonable. + CuBlasLtMatrixLayout Cdesc(ScalarTypeToCudaDataType(result_dtype), m, n, result_ld); +#else CuBlasLtMatrixLayout Cdesc(ScalarTypeToCudaDataType(bias_dtype), m, n, result_ld); +#endif CuBlasLtMatrixLayout Ddesc(ScalarTypeToCudaDataType(result_dtype), m, n, result_ld); if (bias_ptr) { computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr); @@ -945,6 +1508,7 @@ void scaled_gemm( size_t workspaceSize = _getWorkspaceSize(); auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); auto workspace = allocator.allocate(workspaceSize); + TORCH_CHECK(workspace.get() != nullptr, "OOM trying to allocate workspace for cublaslt"); CuBlasLtMatmulPreference preference; preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspaceSize); @@ -963,10 +1527,53 @@ void scaled_gemm( &heuristicResult, &returnedResult)); if (returnedResult == 0) { +#ifndef USE_ROCM TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); +#else + // hipblaslt might be able to recover by returning all algos + std::vector all_algos; + TORCH_CUDABLAS_CHECK(hipblaslt_ext::getAllAlgos( + ltHandle, + hipblaslt_ext::GemmType::HIPBLASLT_GEMM, + _cublasOpFromChar(transa), + _cublasOpFromChar(transb), + HIPTOLT(ScalarTypeToCudaDataType(mat1_dtype)), + HIPTOLT(ScalarTypeToCudaDataType(mat2_dtype)), + // C is nullptr and beta=0, so set to something reasonable. See above. 
+ //HIPTOLT(ScalarTypeToCudaDataType(bias_dtype)), + HIPTOLT(ScalarTypeToCudaDataType(result_dtype)), + HIPTOLT(ScalarTypeToCudaDataType(result_dtype)), + HIPCOMPTOLT(CUBLAS_COMPUTE_32F), + all_algos)); + if (all_algos.size() == 0) { + TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); + } + // pick first valid solution + bool found = false; + for (size_t i = 0; i < all_algos.size(); i++) { + size_t ret_workspace_size = 0; + auto is_valid_status = hipblaslt_ext::matmulIsAlgoSupported( + ltHandle, + computeDesc.descriptor(), + &alpha_val, + Adesc.descriptor(), + Bdesc.descriptor(), + &beta_val, + Cdesc.descriptor(), + Ddesc.descriptor(), + all_algos[i].algo, + ret_workspace_size); + if (is_valid_status == HIPBLAS_STATUS_SUCCESS) { + if (ret_workspace_size <= workspaceSize) { + heuristicResult = all_algos[i]; + found = true; + break; + } + } + } + TORCH_CHECK(found, "could not find valid hipblaslt solution"); +#endif } - float alpha_val = 1.0; - float beta_val = 0.0; cublasStatus_t cublasStatus = cublasLtMatmul( ltHandle, computeDesc.descriptor(), @@ -976,7 +1583,11 @@ void scaled_gemm( mat2_ptr, Bdesc.descriptor(), &beta_val, +#ifdef USE_ROCM + result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr +#else nullptr, +#endif Cdesc.descriptor(), result_ptr, Ddesc.descriptor(), @@ -1009,7 +1620,7 @@ void scaled_gemm( " scaleType ", scaleType); return; - #endif // CUDA_VERSION >= 11080 +#endif // CUDA_VERSION >= 11080 || (defined(USE_ROCM) && ROCM_VERSION >= 60000) TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above"); } @@ -1044,11 +1655,34 @@ void int8_gemm( CuBlasLtMatrixLayout Bdesc(abType, k, n, mat2_ld, transpose_mat2); CuBlasLtMatrixLayout Cdesc(cType, m, n, result_ld); - cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); - // cublas team: alpha and beta need to be the same dtype as of scaleType at::opmath_type alpha_val = 1; int32_t beta_val = 0; + cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); + +#ifdef USE_ROCM + CuBlasLtMatmulPreference preference; + size_t workspaceSize = _getWorkspaceSize(); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspaceSize); + auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); + auto workspace = allocator.allocate(workspaceSize); + cublasLtMatmulHeuristicResult_t heuristicResult = {}; + int returnedResult = 0; + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + computeDesc.descriptor(), + Adesc.descriptor(), + Bdesc.descriptor(), + Cdesc.descriptor(), + Cdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + if (returnedResult == 0) { + TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); + } +#endif cublasStatus_t cublasStatus = cublasLtMatmul( ltHandle, @@ -1063,9 +1697,21 @@ void int8_gemm( Cdesc.descriptor(), result_ptr, Cdesc.descriptor(), +#ifdef USE_ROCM + &heuristicResult.algo, +#else nullptr, // Heuristics don't seem to work for int8 +#endif +#ifdef USE_ROCM + workspace.mutable_get(), +#else nullptr, // Non-zero workspace doesn't seem to work. 
+#endif +#ifdef USE_ROCM + workspaceSize, +#else 0, +#endif at::cuda::getCurrentCUDAStream()); TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, @@ -1099,7 +1745,7 @@ void int8_gemm( TORCH_CHECK(false, "int8_gemm is only supported for ROCm 6.0 and above"); #endif // !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60000) } -#endif // (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +#endif // !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) // ROCm 5.6 hipblas matches the const Dtype *A API, but prior hipblas does not. #if defined(USE_ROCM) && ROCM_VERSION < 50600 diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index ee3b41b4376a9..24aad7678ec49 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -44,6 +44,8 @@ class PointerModeGuard { const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, at::opmath_type beta,\ Dtype *c, int64_t ldc +#define CUDABLAS_GEMM_ARGS(Dtype) transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc + template inline void gemm(CUDABLAS_GEMM_ARGTYPES(Dtype)) { AT_ERROR("at::cuda::blas::gemm: not implemented for ", typeid(Dtype).name()); @@ -62,7 +64,25 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)); template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +template +inline void gemm_internal(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + AT_ERROR("at::cuda::blas::gemm_internal: not implemented for ", typeid(Dtype).name()); +} + +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)); +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)); +template <> +void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)); +template <> +void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)); +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)); +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); + +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) enum GEMMAndBiasActivationEpilogue { None, RELU, @@ -131,6 +151,9 @@ void scaled_gemm( const Dtype *b, int64_t ldb, int64_t strideb, \ at::opmath_type beta, Dtype *c, int64_t ldc, int64_t stridec, int64_t num_batches +#define CUDABLAS_BGEMM_ARGS(Dtype) \ + transa, transb, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, beta, c, ldc, stridec, num_batches + template inline void bgemm(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { AT_ERROR("at::cuda::blas::bgemm: not implemented for ", typeid(Dtype).name()); @@ -149,6 +172,24 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)); template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +template +inline void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + AT_ERROR("at::cuda::blas::bgemm_internal: not implemented for ", typeid(Dtype).name()); +} + +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)); +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)); +template <> +void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)); +template <> +void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)); +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)); +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); + #if defined(USE_ROCM) && ROCM_VERSION <= 50500 // ROCm 5.6 hipblas matches the const Dtype *A API, but prior hipblas does not. 
#define CUDABLAS_TRSM_ARGTYPES(Dtype) \ diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp index 946bbf77497ef..ab92001f5ef0d 100644 --- a/aten/src/ATen/cuda/CUDAContext.cpp +++ b/aten/src/ATen/cuda/CUDAContext.cpp @@ -44,7 +44,7 @@ cudaDeviceProp* getCurrentDeviceProperties() { return getDeviceProperties(device); } -cudaDeviceProp* getDeviceProperties(int64_t device) { +cudaDeviceProp* getDeviceProperties(c10::DeviceIndex device) { c10::call_once(init_flag, initCUDAContextVectors); if (device == -1) device = c10::cuda::current_device(); AT_ASSERT(device >= 0 && device < num_gpus, "device=", device, ", num_gpus=", num_gpus); @@ -52,7 +52,7 @@ cudaDeviceProp* getDeviceProperties(int64_t device) { return &device_properties[device]; } -bool canDeviceAccessPeer(int64_t device, int64_t peer_device) { +bool canDeviceAccessPeer(c10::DeviceIndex device, c10::DeviceIndex peer_device) { c10::call_once(init_flag, initCUDAContextVectors); if (device == -1) device = c10::cuda::current_device(); AT_ASSERT(device >= 0 && device < num_gpus, "device=", device, ", num_gpus=", num_gpus); diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h index c189ed20fd4b0..60d09dfaee169 100644 --- a/aten/src/ATen/cuda/CUDAContextLight.h +++ b/aten/src/ATen/cuda/CUDAContextLight.h @@ -9,7 +9,7 @@ // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also // added bf16 support -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) #include #endif @@ -71,18 +71,18 @@ TORCH_CUDA_CPP_API cudaDeviceProp* getCurrentDeviceProperties(); TORCH_CUDA_CPP_API int warp_size(); -TORCH_CUDA_CPP_API cudaDeviceProp* getDeviceProperties(int64_t device); +TORCH_CUDA_CPP_API cudaDeviceProp* getDeviceProperties(c10::DeviceIndex device); TORCH_CUDA_CPP_API bool canDeviceAccessPeer( - int64_t device, - int64_t peer_device); + c10::DeviceIndex device, + c10::DeviceIndex peer_device); TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); /* Handles */ TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); #endif diff --git a/aten/src/ATen/cuda/CUDADataType.h b/aten/src/ATen/cuda/CUDADataType.h index 3068eb787a837..8615bcdae9117 100644 --- a/aten/src/ATen/cuda/CUDADataType.h +++ b/aten/src/ATen/cuda/CUDADataType.h @@ -31,8 +31,6 @@ template<> inline cudaDataType getCudaDataType>() { return CUDA_C_64F; } -// HIP doesn't define integral types -#ifndef USE_ROCM template<> inline cudaDataType getCudaDataType() { return CUDA_R_8U; } @@ -42,9 +40,7 @@ template<> inline cudaDataType getCudaDataType() { template<> inline cudaDataType getCudaDataType() { return CUDA_R_32I; } -#endif -#if !defined(USE_ROCM) template<> inline cudaDataType getCudaDataType() { return CUDA_R_16I; } @@ -54,19 +50,15 @@ template<> inline cudaDataType getCudaDataType() { template<> inline cudaDataType getCudaDataType() { return CUDA_R_16BF; } -#endif inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) { switch (scalar_type) { -// HIP doesn't define integral types -#ifndef USE_ROCM case c10::ScalarType::Byte: return CUDA_R_8U; case 
c10::ScalarType::Char: return CUDA_R_8I; case c10::ScalarType::Int: return CUDA_R_32I; -#endif case c10::ScalarType::Half: return CUDA_R_16F; case c10::ScalarType::Float: @@ -79,7 +71,6 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) return CUDA_C_32F; case c10::ScalarType::ComplexDouble: return CUDA_C_64F; -#if !defined(USE_ROCM) case c10::ScalarType::Short: return CUDA_R_16I; case c10::ScalarType::Long: @@ -92,6 +83,18 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) case c10::ScalarType::Float8_e5m2: return CUDA_R_8F_E5M2; #endif +#if defined(USE_ROCM) +#if defined(HIP_NEW_TYPE_ENUMS) + case c10::ScalarType::Float8_e4m3fnuz: + return HIP_R_8F_E4M3_FNUZ; + case c10::ScalarType::Float8_e5m2fnuz: + return HIP_R_8F_E5M2_FNUZ; +#else + case c10::ScalarType::Float8_e4m3fnuz: + return static_cast(1000); + case c10::ScalarType::Float8_e5m2fnuz: + return static_cast(1001); +#endif #endif default: TORCH_INTERNAL_ASSERT(false, "Cannot convert ScalarType ", scalar_type, " to cudaDataType.") diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h index ca6878721406b..e3c331a9c99fa 100644 --- a/aten/src/ATen/cuda/CUDAEvent.h +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -48,9 +48,9 @@ struct TORCH_CUDA_CPP_API CUDAEvent { CUDAGuard guard(device_index_); const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_deletion(reinterpret_cast(event_)); + (*interp)->trace_gpu_event_deletion(at::kCUDA, reinterpret_cast(event_)); } - cudaEventDestroy(event_); + AT_CUDA_CHECK(cudaEventDestroy(event_)); } } catch (...) { /* No throw */ } } @@ -122,7 +122,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent { AT_CUDA_CHECK(cudaEventRecord(event_, stream)); const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_record( + (*interp)->trace_gpu_event_record(at::kCUDA, reinterpret_cast(event_), reinterpret_cast(stream.stream()) ); @@ -138,7 +138,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent { AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, 0)); const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_wait( + (*interp)->trace_gpu_event_wait(at::kCUDA, reinterpret_cast(event_), reinterpret_cast(stream.stream()) ); @@ -151,6 +151,10 @@ struct TORCH_CUDA_CPP_API CUDAEvent { TORCH_CHECK(is_created_ && other.isCreated(), "Both events must be recorded before calculating elapsed time."); float time_ms = 0; + // We do not strictly have to set the device index to the same as our event, + // but if we don't and the current device is not initialized, it will + // create a new cuda context, which will consume a lot of memory. 
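// [Illustrative aside, not part of the patch] Typical elapsed-time measurement that
// the CUDAGuard added below now protects (sketch; uses the CUDAEvent API from this header):
//   at::cuda::CUDAEvent start(cudaEventDefault), stop(cudaEventDefault);  // timing-capable
//   auto stream = at::cuda::getCurrentCUDAStream();
//   start.record(stream);
//   /* ... enqueue work on `stream` ... */
//   stop.record(stream);
//   stop.synchronize();
//   float ms = start.elapsed_time(stop);  // guarded, so it no longer initializes a fresh
//                                         // context on an unrelated current device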
+ CUDAGuard guard(device_index_); // raise cudaErrorNotReady if either event is recorded but not yet completed AT_CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_, other.event_)); return time_ms; @@ -161,7 +165,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent { if (is_created_) { const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_synchronization(reinterpret_cast(event_)); + (*interp)->trace_gpu_event_synchronization(at::kCUDA, reinterpret_cast(event_)); } AT_CUDA_CHECK(cudaEventSynchronize(event_)); } @@ -191,7 +195,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent { AT_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_)); const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_creation(reinterpret_cast(event_)); + (*interp)->trace_gpu_event_creation(at::kCUDA, reinterpret_cast(event_)); } is_created_ = true; } diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index b8004ec7e7e37..7e19ce98fbf9d 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -1,10 +1,13 @@ +#include +#include #include #include +#include #include #include #include #include -#include +#include namespace at { namespace cuda::detail { @@ -24,10 +27,10 @@ static std::deque cuda_gens_init_flag; static std::vector default_gens_cuda; /* -* Populates the global variables related to CUDA generators -* Warning: this function must only be called once! -*/ -static void initCUDAGenVector(){ + * Populates the global variables related to CUDA generators + * Warning: this function must only be called once! + */ +static void initCUDAGenVector() { num_gpus = c10::cuda::device_count(); cuda_gens_init_flag.resize(num_gpus); default_gens_cuda.resize(num_gpus); @@ -77,6 +80,150 @@ Generator createCUDAGenerator(DeviceIndex device_index) { } // namespace cuda::detail +/** + * Creates a clone of this CUDA Generator State. + */ +c10::intrusive_ptr CUDAGeneratorState::clone() { + return make_intrusive( + seed_, philox_offset_per_thread_, offset_intragraph_); +} + +/** + * Function to increase the internal offset based on the specified increment. + */ +void CUDAGeneratorState::increase(uint64_t increment) { + // Rounds increment up to the nearest multiple of 4 to meet alignment + // requirements. + // see Note [Why enforce RNG offset % 4 == 0?] + increment = ((increment + 3) / 4) * 4; + // Handling different behaviors based on whether capturing is active. + if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) { + // Ensures that the state is actually capturing. + TORCH_CHECK( + capturing_, + "Attempt to increase offset for a CUDA generator not in capture mode."); + // Ensures the offset is a multiple of 4 + // see Note [Why enforce RNG offset % 4 == 0?] + TORCH_INTERNAL_ASSERT( + offset_intragraph_ % 4 == 0, "RNG offset must be a multiple of 4."); + // Ensures the increment does not cause overflow. + TORCH_INTERNAL_ASSERT( + offset_intragraph_ <= std::numeric_limits::max() - increment, + "Increment causes overflow in the offset value."); + offset_intragraph_ += increment; + } else { + // Checks that the increment is expected outside graph capturing. + TORCH_CHECK( + !capturing_, + "Offset increment outside graph capture encountered unexpectedly."); + // Ensures the offset is a multiple of 4 + // see Note [Why enforce RNG offset % 4 == 0?] 
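// [Illustrative aside, not part of the patch] The rounding at the top of increase()
// keeps philox offsets 4-aligned, which is what these assertions check, e.g.:
//   increment = 10 -> ((10 + 3) / 4) * 4 = 12
//   increment = 16 -> ((16 + 3) / 4) * 4 = 16  (already a multiple of 4, unchanged)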
+ TORCH_INTERNAL_ASSERT( + philox_offset_per_thread_ % 4 == 0, + "RNG offset must be a multiple of 4."); + philox_offset_per_thread_ += increment; + } +} + +/** + * Registers this state to a CUDA graph to manage within the graph. + */ +void CUDAGeneratorState::register_graph(cuda::CUDAGraph* graph) { + // Ensures that the RNG state is not currently being captured. + at::cuda::assertNotCapturing( + "Cannot register the state during capturing stage."); + + // If this is the first graph to be registered, allocate memory for the seed + // and offset on the GPU. + if (registered_graphs_.empty()) { + auto options = at::TensorOptions().device(at::kCUDA).dtype(at::kLong); + seed_extragraph_ = at::empty({1}, options); + offset_extragraph_ = at::empty({1}, options); + } + + // Insert the graph into the set of registered graphs if it's not already + // registered. + if (registered_graphs_.find(graph) == registered_graphs_.end()) { + registered_graphs_.insert(graph); + } +} + +/** + * Unregisters a CUDA graph from the RNG state. + */ +void CUDAGeneratorState::unregister_graph(cuda::CUDAGraph* graph) { + // Ensures that the RNG state is not currently being captured. + at::cuda::assertNotCapturing( + "Cannot unregister the state during capturing stage."); + // Verify the graph was previously registered. + TORCH_CHECK( + registered_graphs_.find(graph) != registered_graphs_.end(), + "The graph should be registered to the state"); + + // Remove the graph from the set of registered graphs. + registered_graphs_.erase(graph); + + // If no more graphs are registered, deallocate the GPU memory for the seed + // and offset. + if (registered_graphs_.empty()) { + seed_extragraph_.reset(); + offset_extragraph_.reset(); + } +} + +/** + * Note [Explicit Registration of Generators to the CUDA Graph] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Ideally, it would be more user-friendly if the state could be exchanged and generators + * could be registered with the CUDA graph implicitly. However, resetting GPU tensors during + * the capture stage causes these reset operations to be recorded within the CUDA graph. + * This behavior is undesirable because we do not want these tensors to be reset during + * the replay stage of the graph. + * + * As of now, there is no available method to perform a CUDA operation during the graph's + * recording phase without having that operation be included in the CUDA graph. + * This limitation necessitates explicit user action to register generators with the graph. + * By requiring users to manually register their generators, we can ensure that state resets + * (capture_prologue) only occur before the graph capture begins, thus avoiding unintended + * resets during the replay of the graph. See https://github.com/pytorch/pytorch/pull/114068. + */ + +/** + * Performs the prologue steps for capturing a CUDA graph state. + * This method is intended to reset graph-related state variables before capturing begins. + */ +void CUDAGeneratorState::capture_prologue() { + capturing_ = true; + offset_intragraph_ = 0; + seed_extragraph_.fill_(int64_t(seed_)); + offset_extragraph_.fill_(int64_t(0)); +} + +/** + * Ends the capturing phase and resets related variables, returning the whole + * graph increment. + */ +uint64_t CUDAGeneratorState::capture_epilogue() { + capturing_ = false; + return offset_intragraph_; +} + +/** + * Prepares the state for replay by setting initial state tensors and applying + * total increment. 
+ */ +void CUDAGeneratorState::replay_prologue(uint64_t wholegraph_increment) { + // Ensures the generator is not in capturing mode. + at::cuda::assertNotCapturing( + "Cannot prepare for replay during capturing stage."); + seed_extragraph_.fill_(int64_t(seed_)); + offset_extragraph_.fill_(int64_t(philox_offset_per_thread_)); + // Applies the total increment achieved during previous captures to update the + // offset. + increase(wholegraph_increment); +} + /** * Note [Why enforce RNG offset % 4 == 0?] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -97,8 +244,18 @@ Generator createCUDAGenerator(DeviceIndex device_index) { */ CUDAGeneratorImpl::CUDAGeneratorImpl(DeviceIndex device_index) : c10::GeneratorImpl{Device(DeviceType::CUDA, device_index), - DispatchKeySet(c10::DispatchKey::CUDA)} { + DispatchKeySet(c10::DispatchKey::CUDA)} { at::cuda::assertNotCapturing("Cannot construct a new CUDAGeneratorImpl"); + state_ = make_intrusive(); + no_reset_rnn_state_.clear(); +} + +CUDAGeneratorImpl::CUDAGeneratorImpl( + DeviceIndex device_index, + c10::intrusive_ptr state) + : c10:: + GeneratorImpl{Device(DeviceType::CUDA, device_index), DispatchKeySet(c10::DispatchKey::CUDA)}, + state_(std::move(state)) { no_reset_rnn_state_.clear(); } @@ -109,9 +266,10 @@ CUDAGeneratorImpl::CUDAGeneratorImpl(DeviceIndex device_index) * See Note [Acquire lock when using random generators] */ void CUDAGeneratorImpl::set_current_seed(uint64_t seed) { - at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::set_current_seed"); - seed_ = seed; - philox_offset_per_thread_ = 0; + at::cuda::assertNotCapturing( + "Cannot call CUDAGeneratorImpl::set_current_seed"); + state_->seed_ = seed; + state_->philox_offset_per_thread_ = 0; no_reset_rnn_state_.clear(); } @@ -134,15 +292,9 @@ uint64_t CUDAGeneratorImpl::get_offset() const { // Debatable if get_offset() should be allowed in captured regions. // Conservatively disallow it for now. at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::get_offset"); - return philox_offset_per_thread_; + return state_->philox_offset_per_thread_; } -#define CAPTURE_DEFAULT_GENS_MSG \ -"In regions captured by CUDA graphs, you may only use the default CUDA RNG " \ -"generator on the device that's current when capture begins. " \ -"If you need a non-default (user-supplied) generator, or a generator on another " \ -"device, please file an issue." - /** * Gets the current seed of CUDAGeneratorImpl. */ @@ -150,7 +302,7 @@ uint64_t CUDAGeneratorImpl::current_seed() const { // Debatable if current_seed() should be allowed in captured regions. // Conservatively disallow it for now. at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed"); - return seed_; + return state_->seed_; } /** @@ -194,6 +346,8 @@ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { * and size of the internal state. 
*/ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { + at::cuda::assertNotCapturing( + "Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing."); static const size_t seed_size = sizeof(uint64_t); static const size_t offset_size = sizeof(int64_t); static const size_t total_size = seed_size + offset_size; @@ -208,7 +362,7 @@ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { TORCH_CHECK(new_state_size == total_size, "RNG state is wrong size"); } - uint64_t input_seed; + uint64_t input_seed = 0; auto new_rng_state = new_state.data_dtype_initialized(); memcpy(&input_seed, new_rng_state, seed_size); this->set_current_seed(input_seed); @@ -219,44 +373,59 @@ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { this->set_philox_offset_per_thread(static_cast(philox_offset)); } +/** + * Sets the generator's current state to + * This function allows switching between different registered states of + * the generator. + */ +void CUDAGeneratorImpl::graphsafe_set_state( + const c10::intrusive_ptr& gen) { + c10::intrusive_ptr cuda_gen = + dynamic_intrusive_pointer_cast(gen); + TORCH_CHECK(cuda_gen, "Expected a CUDA Generator"); + state_ = cuda_gen->state_; +} + +/** + * Get the GeneratorImpl that point to current state_ + */ +c10::intrusive_ptr CUDAGeneratorImpl::graphsafe_get_state() + const { + auto gen = make_intrusive(device().index(), state_); + return gen; +} + /** * Sets the philox_offset_per_thread_ to be used by curandStatePhilox4_32_10 * * See Note [Acquire lock when using random generators] */ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { - at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::set_philox_offset_per_thread"); // see Note [Why enforce RNG offset % 4 == 0?] TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4"); - philox_offset_per_thread_ = offset; + state_->philox_offset_per_thread_ = offset; } /** * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl. */ uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const { - at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::philox_offset_per_thread"); - return philox_offset_per_thread_; + return state_->philox_offset_per_thread_; } /** - * Called by CUDAGraph to prepare this instance for a graph capture region. - * offset_extragraph is the initial offset at the start of the graphed region. - * offset_intragraph tracks the offset in the graphed region. + * Registers this state to a CUDA graph to manage within the graph. */ -void CUDAGeneratorImpl::capture_prologue(int64_t* seed_extragraph, int64_t* offset_extragraph) { - seed_extragraph_ = seed_extragraph; - offset_extragraph_ = offset_extragraph; - offset_intragraph_ = 0; - graph_expects_this_gen_ = true; +void CUDAGeneratorImpl::register_graph(cuda::CUDAGraph* graph) { + graph->register_generator_state(state_); + state_->register_graph(graph); } /** - * Called by CUDAGraph to finalize a graph capture region for this instance. + * Unregisters a CUDA graph from the RNG state. 
*/ -uint64_t CUDAGeneratorImpl::capture_epilogue() { - graph_expects_this_gen_ = false; - return offset_intragraph_; +void CUDAGeneratorImpl::unregister_graph(cuda::CUDAGraph* graph) { + state_->unregister_graph(graph); } /** @@ -281,30 +450,17 @@ uint64_t CUDAGeneratorImpl::capture_epilogue() { * See Note [Acquire lock when using random generators] */ PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) { - // rounds increment up to the nearest multiple of 4 - increment = ((increment + 3) / 4) * 4; if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) { - TORCH_CHECK(graph_expects_this_gen_, - "philox_cuda_state for an unexpected CUDA generator used during capture. " - CAPTURE_DEFAULT_GENS_MSG); - // see Note [Why enforce RNG offset % 4 == 0?] - TORCH_INTERNAL_ASSERT(this->offset_intragraph_ % 4 == 0); - uint32_t offset = this->offset_intragraph_; - TORCH_INTERNAL_ASSERT(this->offset_intragraph_ <= - std::numeric_limits::max() - increment); - this->offset_intragraph_ += increment; - return PhiloxCudaState(this->seed_extragraph_, - this->offset_extragraph_, - offset); + uint32_t offset = state_->offset_intragraph_; + state_->increase(increment); + return PhiloxCudaState( + state_->seed_extragraph_.data_ptr(), + state_->offset_extragraph_.data_ptr(), + offset); } else { - TORCH_CHECK(!graph_expects_this_gen_, - "CUDA generator expects graph capture to be underway, " - "but the current stream is not capturing."); - // see Note [Why enforce RNG offset % 4 == 0?] - TORCH_INTERNAL_ASSERT(this->philox_offset_per_thread_ % 4 == 0); - uint64_t offset = this->philox_offset_per_thread_; - this->philox_offset_per_thread_ += increment; - return PhiloxCudaState(this->seed_, offset); + uint64_t offset = state_->philox_offset_per_thread_; + state_->increase(increment); + return PhiloxCudaState(state_->seed_, offset); } } @@ -312,16 +468,13 @@ PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) { * Temporarily accommodates call sites that use philox_engine_inputs. * Allows incremental refactor of call sites to use philox_cuda_state. */ -std::pair CUDAGeneratorImpl::philox_engine_inputs(uint64_t increment) { - at::cuda::assertNotCapturing("Refactor this op to use CUDAGeneratorImpl::philox_cuda_state. " - "Cannot call CUDAGeneratorImpl::philox_engine_inputs"); - // rounds increment up to the nearest multiple of 4 - increment = ((increment + 3) / 4) * 4; - // see Note [Why enforce RNG offset % 4 == 0?] - TORCH_INTERNAL_ASSERT(this->philox_offset_per_thread_ % 4 == 0); - uint64_t offset = this->philox_offset_per_thread_; - this->philox_offset_per_thread_ += increment; - return std::make_pair(this->seed_, offset); +std::pair CUDAGeneratorImpl::philox_engine_inputs( + uint64_t increment) { + at::cuda::assertNotCapturing( + "Refactor this op to use CUDAGeneratorImpl::philox_cuda_state. 
Cannot call CUDAGeneratorImpl::philox_engine_inputs"); + uint64_t offset = state_->philox_offset_per_thread_; + state_->increase(increment); + return std::make_pair(state_->seed_, offset); } /* @@ -348,9 +501,7 @@ std::shared_ptr CUDAGeneratorImpl::clone() const { */ CUDAGeneratorImpl* CUDAGeneratorImpl::clone_impl() const { at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::clone_impl"); - auto gen = new CUDAGeneratorImpl(this->device().index()); - gen->set_current_seed(this->seed_); - gen->set_philox_offset_per_thread(this->philox_offset_per_thread_); + auto gen = new CUDAGeneratorImpl(this->device().index(), state_->clone()); return gen; } diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.h b/aten/src/ATen/cuda/CUDAGeneratorImpl.h index 2fe8a6f6c8f4f..0fe664e35f54c 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.h +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.h @@ -1,12 +1,19 @@ #pragma once +#include #include +#include #include -#include -#include #include - +#include +#include +#include namespace at { + +namespace cuda { +struct CUDAGraph; +} + /** * Note [CUDA Graph-safe RNG states] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -87,9 +94,41 @@ namespace at { * */ +struct CUDAGeneratorState : public c10::intrusive_ptr_target { + uint64_t seed_; + uint64_t philox_offset_per_thread_; + uint32_t offset_intragraph_; + bool capturing_{}; + std::unordered_set registered_graphs_; + at::TensorBase seed_extragraph_{}; + at::TensorBase offset_extragraph_{}; + + CUDAGeneratorState( + uint64_t seed = default_rng_seed_val, + uint64_t philox_offset_per_thread = 0, + uint32_t offset_intragraph = 0) + : seed_(seed), + philox_offset_per_thread_(philox_offset_per_thread), + offset_intragraph_(offset_intragraph) {} + + void increase(uint64_t increment); + + void register_graph(cuda::CUDAGraph* graph); + void unregister_graph(cuda::CUDAGraph* graph); + + void capture_prologue(); + // capture_epilogue returns the wholegraph_increment + uint64_t capture_epilogue(); + void replay_prologue(uint64_t wholegraph_increment); + c10::intrusive_ptr clone(); +}; + struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { // Constructors CUDAGeneratorImpl(DeviceIndex device_index = -1); + CUDAGeneratorImpl( + DeviceIndex device_index, + c10::intrusive_ptr state_); ~CUDAGeneratorImpl() override = default; // CUDAGeneratorImpl methods @@ -101,10 +140,18 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { uint64_t seed() override; void set_state(const c10::TensorImpl& new_state) override; c10::intrusive_ptr get_state() const override; + void graphsafe_set_state( + const c10::intrusive_ptr& state) override; + c10::intrusive_ptr graphsafe_get_state() const override; + void set_philox_offset_per_thread(uint64_t offset); uint64_t philox_offset_per_thread() const; - void capture_prologue(int64_t* seed_extragraph, int64_t* offset_extragraph); - uint64_t capture_epilogue(); + + void register_graph(cuda::CUDAGraph* graph); + void unregister_graph(cuda::CUDAGraph* graph); + + // Generates a PhiloxCudaState with a specified increment, and increment + // current state PhiloxCudaState philox_cuda_state(uint64_t increment); bool reset_rnn_state() { @@ -117,14 +164,10 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { static c10::DeviceType device_type(); -private: + private: CUDAGeneratorImpl* clone_impl() const override; - uint64_t seed_ = default_rng_seed_val; - uint64_t philox_offset_per_thread_ = 0; - int64_t* seed_extragraph_{}; - int64_t* 
offset_extragraph_{}; - uint32_t offset_intragraph_ = 0; - bool graph_expects_this_gen_ = false; + + c10::intrusive_ptr state_; std::atomic_flag no_reset_rnn_state_; }; diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 1093426c983b6..436408f88a519 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -6,7 +6,10 @@ #include #include +#include +#include #include +#include namespace at::cuda { @@ -86,26 +89,33 @@ CUDAGraph::CUDAGraph() #endif } +void CUDAGraph::register_generator_state( + c10::intrusive_ptr state) { + captured_generator_states_[std::move(state)] = 0; +} + +void CUDAGraph::register_generator_state(const at::Generator& generator) { + c10::intrusive_ptr cuda_gen = + dynamic_intrusive_pointer_cast( + generator.getIntrusivePtr()); + cuda_gen->register_graph(this); +} + void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capture_mode) { #if !defined(USE_ROCM) || ROCM_VERSION >= 50300 TORCH_CHECK(!has_graph_exec_, "This CUDAGraph instance already owns a captured graph. " "To capture a new graph, create a new instance."); - // For now, a CUDAGraph instance only accommodates the default generator on the device that's - // current when capture begins. If any op in the captured region uses a non-default generator, - // or a generator on another device, the offending generator will throw an error. - // These restrictions simplify CUDAGraph, but could be relaxed in the future: - // in principle, the underlying Cuda calls do permit cross-device ops to be captured. + // default generator is always registered auto* gen = get_generator_or_default( c10::nullopt, cuda::detail::getDefaultCUDAGenerator()); + gen->register_graph(this); - auto options = TensorOptions().device(at::kCUDA).dtype(at::kLong); - seed_extragraph_ = at::empty({1}, options); - offset_extragraph_ = at::empty({1}, options); - - seed_extragraph_.fill_(int64_t(gen->current_seed())); - gen->capture_prologue(seed_extragraph_.data_ptr(), offset_extragraph_.mutable_data_ptr()); + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + generator_state->capture_prologue(); + } auto stream = at::cuda::getCurrentCUDAStream(); @@ -115,7 +125,6 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt "default stream.)"); capture_stream_ = stream; - capture_gen_ = gen; capture_dev_ = c10::cuda::current_device(); id_ = capture_sequence_id(); @@ -215,13 +224,10 @@ void CUDAGraph::capture_end() { has_graph_exec_ = true; - auto* gen = get_generator_or_default( - c10::nullopt, cuda::detail::getDefaultCUDAGenerator()); - TORCH_CHECK(gen == capture_gen_, - "Default CUDA RNG generator on current device at capture end " - "is different from default generator on current device " - "when capture began"); - wholegraph_increment_ = gen->capture_epilogue(); + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + wholegraph_increments = generator_state->capture_epilogue(); + } size_t numCUDAGraphNodes = 0; AT_CUDA_CHECK(cudaGraphGetNodes(graph_, NULL, &numCUDAGraphNodes)); @@ -251,17 +257,10 @@ void CUDAGraph::replay() { c10::OptionalDeviceGuard device_guard{capture_stream_.device()}; - // Just like any RNG consumer kernel! 
- auto* gen = get_generator_or_default( - c10::nullopt, cuda::detail::getDefaultCUDAGenerator()); - PhiloxCudaState rng_engine_inputs; - { - std::lock_guard lock(gen->mutex_); - rng_engine_inputs = gen->philox_cuda_state(wholegraph_increment_); + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + generator_state->replay_prologue(wholegraph_increments); } - seed_extragraph_.fill_(int64_t(gen->current_seed())); - offset_extragraph_.fill_(int64_t(rng_engine_inputs.offset_.val)); - // graph_exec_ may be replayed in any stream. AT_CUDA_CHECK(cudaGraphLaunch(graph_exec_, at::cuda::getCurrentCUDAStream())); @@ -355,6 +354,10 @@ TORCH_CHECK(has_graph_exec_, } CUDAGraph::~CUDAGraph() { + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + generator_state->unregister_graph(this); + } reset(); } diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index b395de9a252a7..3acdad18b0eee 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -4,12 +4,13 @@ #include #include #include - -#include +#include namespace at { +struct Generator; struct CUDAGeneratorImpl; +struct CUDAGeneratorState; namespace cuda { @@ -24,7 +25,12 @@ struct TORCH_CUDA_CPP_API CUDAGraph { static void inc_pending_event_queries(); static void dec_pending_event_queries(); static int num_pending_event_queries(); - void capture_begin(MempoolId_t pool={0, 0}, cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal); + // See Note [Explicit Registration of Generators to the CUDA Graph] + void register_generator_state(c10::intrusive_ptr state); + void register_generator_state(const at::Generator& generator); + void capture_begin( + MempoolId_t pool = {0, 0}, + cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal); void capture_end(); void replay(); void reset(); @@ -32,7 +38,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph { void enable_debug_mode(); void debug_dump(const std::string& debug_path); - protected: + protected: #if !defined(USE_ROCM) || ROCM_VERSION >= 50300 cudaGraph_t graph_ = NULL; cudaGraphExec_t graph_exec_ = NULL; @@ -73,19 +79,16 @@ struct TORCH_CUDA_CPP_API CUDAGraph { // Stream on which capture began at::cuda::CUDAStream capture_stream_; - // Default generator on device where capture began - at::CUDAGeneratorImpl* capture_gen_; + // multiple generator states and their wholegraph_increments in this graph + // that are managed by the CUDA Graph + ska::flat_hash_map, uint64_t> + captured_generator_states_; // Device where capture occurred. Right now, for simplicity, we require all ops // in a capture to run on the same device, but this is a limitation of CUDAGraph, // not CUDA itself. We can straightforwardly modify CUDAGraph to support multi-device // captures if needed. 
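As a hedged sketch of how a caller drives the registration API introduced by this patch (names come from the headers above; stream handling is simplified, and production code would normally warm up allocations before capturing):

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/CUDAGraph.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>

void capture_with_user_generator() {
  // A user-supplied CUDA generator; previously only the default generator on
  // the capturing device could be used inside a graph.
  at::Generator gen = at::cuda::detail::createCUDAGenerator();

  // Capture must not run on the default stream.
  c10::cuda::CUDAStreamGuard stream_guard(c10::cuda::getStreamFromPool());

  at::cuda::CUDAGraph graph;
  // See Note [Explicit Registration of Generators to the CUDA Graph]:
  // registering up front lets capture_prologue reset the extragraph
  // seed/offset tensors before recording begins.
  graph.register_generator_state(gen);

  graph.capture_begin();
  auto sample = at::randn({8}, gen, at::TensorOptions().device(at::kCUDA));
  graph.capture_end();

  graph.replay();  // replay_prologue refreshes seed/offset, then relaunches the graph
}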
int capture_dev_; - - // RNG state trackers - at::Tensor seed_extragraph_; - at::Tensor offset_extragraph_; - uint64_t wholegraph_increment_; }; } // namespace cuda diff --git a/aten/src/ATen/cuda/CUDASparse.h b/aten/src/ATen/cuda/CUDASparse.h index 0d4520938291c..1052469ea7d8a 100644 --- a/aten/src/ATen/cuda/CUDASparse.h +++ b/aten/src/ATen/cuda/CUDASparse.h @@ -30,33 +30,19 @@ #endif #if defined(USE_ROCM) - // hipSparse const API added in v2.4.0 #if HIPSPARSE_VERSION >= 200400 #define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 1 -#define AT_USE_HIPSPARSE_GENERIC_52_API() 0 +#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 #define AT_USE_HIPSPARSE_GENERIC_API() 1 #else #define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 - -// hipSparse Generic API ROCm 5.2 -#if ROCM_VERSION >= 50200 -#define AT_USE_HIPSPARSE_GENERIC_52_API() 1 -#else -#define AT_USE_HIPSPARSE_GENERIC_52_API() 0 -#endif - -// hipSparse Generic API ROCm 5.1 -#if ROCM_VERSION >= 50100 +#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 1 #define AT_USE_HIPSPARSE_GENERIC_API() 1 -#else -#define AT_USE_HIPSPARSE_GENERIC_API() 0 #endif - -#endif // HIPSPARSE_VERSION >= 200400 #else // USE_ROCM #define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 -#define AT_USE_HIPSPARSE_GENERIC_52_API() 0 +#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 #define AT_USE_HIPSPARSE_GENERIC_API() 0 #endif // USE_ROCM diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp index e01663b3f28c9..3004eb142684f 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp @@ -7,6 +7,10 @@ namespace at::cuda::sparse { +cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr) { + return cusparseDestroyDnMat(const_cast(dnMatDescr)); +} + #if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() namespace { @@ -51,8 +55,8 @@ cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type) { } } -#if AT_USE_HIPSPARSE_GENERIC_52_API() || AT_USE_CUSPARSE_GENERIC_API() -CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset) { +#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() +cusparseDnMatDescr_t createRawDnMatDescriptor(const Tensor& input, int64_t batch_offset, bool is_const=false) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.layout() == kStrided); IntArrayRef input_strides = input.strides(); IntArrayRef input_sizes = input.sizes(); @@ -79,12 +83,16 @@ CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input, int64_t ba #endif auto batch_stride = ndim > 2 && batch_offset >= 0 ? input_strides[ndim - 3] : 0; - void* values_ptr = static_cast(input.data_ptr()) + + void* data_ptr = is_const ? 
const_cast(input.const_data_ptr()) : input.data_ptr(); + void* values_ptr = static_cast(data_ptr) + batch_offset * batch_stride * input.itemsize(); cudaDataType value_type = ScalarTypeToCudaDataType(input.scalar_type()); check_supported_cuda_type(value_type); + // NOTE: Ideally, in the const case, we would use cusparseConstDnMatDescr_t + // and cusparseCreateConstDnMat, but those were introduced in CUDA 12, and we + // still need to support CUDA 11 cusparseDnMatDescr_t raw_descriptor; TORCH_CUDASPARSE_CHECK(cusparseCreateDnMat( &raw_descriptor, @@ -101,10 +109,17 @@ CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input, int64_t ba TORCH_CUDASPARSE_CHECK(cusparseDnMatSetStridedBatch( raw_descriptor, batch_count, input_strides[ndim - 3])); } + return raw_descriptor; +} - descriptor_.reset(raw_descriptor); +CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset) { + descriptor_.reset(createRawDnMatDescriptor(input, batch_offset)); +} + +CuSparseConstDnMatDescriptor::CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset) { + descriptor_.reset(createRawDnMatDescriptor(input, batch_offset, /*is_const*/true)); } -#endif // AT_USE_HIPSPARSE_GENERIC_52_API() || AT_USE_CUSPARSE_GENERIC_API() +#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() CuSparseDnVecDescriptor::CuSparseDnVecDescriptor(const Tensor& input) { // cuSPARSE doesn't support batched vectors @@ -175,7 +190,6 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6 value_type // data type of values )); -#if AT_USE_HIPSPARSE_GENERIC_52_API() || !defined(USE_ROCM) if (ndim == 3 && batch_offset == -1) { int batch_count = at::native::cuda_int_cast(at::native::batchCount(input), "batch_count"); @@ -197,9 +211,6 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6 cusparseCsrSetStridedBatch(raw_descriptor, batch_count, 0, 0)); } } -#else - TORCH_CHECK(ndim == 2, "Experimental support for batched CSR matrices is implemented only for CUDA 11+"); -#endif descriptor_.reset(raw_descriptor); } diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.h b/aten/src/ATen/cuda/CUDASparseDescriptors.h index 03958b1d404b9..9e3d50f34e77b 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.h +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.h @@ -73,6 +73,10 @@ using bsrsm2Info = std::remove_pointer::type; #endif #endif +// NOTE: This is only needed for CUDA 11 and earlier, since CUDA 12 introduced +// API for const descriptors +cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr); + class TORCH_CUDA_CPP_API CuSparseMatDescriptor : public CuSparseDescriptor { public: @@ -123,14 +127,25 @@ class TORCH_CUDA_CPP_API CuSparseBsrsm2Info cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type); -#if AT_USE_HIPSPARSE_GENERIC_52_API() || \ - (AT_USE_CUSPARSE_GENERIC_API() && AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS()) +#if AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor : public CuSparseDescriptor { public: explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); }; +class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor + : public CuSparseDescriptor { + public: + explicit CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); + cusparseDnMatDescr* unsafe_mutable_descriptor() const { + return const_cast(descriptor()); + } + cusparseDnMatDescr* 
unsafe_mutable_descriptor() { + return const_cast(descriptor()); + } +}; + class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor : public CuSparseDescriptor { public: @@ -140,8 +155,6 @@ class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor : public CuSparseDescriptor {}; -//AT_USE_HIPSPARSE_GENERIC_52_API() || (AT_USE_CUSPARSE_GENERIC_API() && AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS()) - #elif AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor : public ConstCuSparseDescriptor< @@ -153,6 +166,22 @@ class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor int64_t batch_offset = -1); }; + class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor + : public ConstCuSparseDescriptor< + const cusparseDnMatDescr, + &destroyConstDnMat> { + public: + explicit CuSparseConstDnMatDescriptor( + const Tensor& input, + int64_t batch_offset = -1); + cusparseDnMatDescr* unsafe_mutable_descriptor() const { + return const_cast(descriptor()); + } + cusparseDnMatDescr* unsafe_mutable_descriptor() { + return const_cast(descriptor()); + } + }; + class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor : public ConstCuSparseDescriptor< cusparseDnVecDescr, @@ -165,7 +194,7 @@ class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor : public ConstCuSparseDescriptor< cusparseSpMatDescr, &cusparseDestroySpMat> {}; -#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() +#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor : public CuSparseSpMatDescriptor { diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index 22dbb661f18b4..f4f22711d61a3 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -8,34 +8,11 @@ #include #include -#include -#include #include -#include -#include -#include -#include -#include -#include namespace at::cuda { namespace { -struct BlockSize { - size_t size_{0}; - void* ptr_{nullptr}; -}; - -struct Block { - size_t size_{0}; - void* ptr_{nullptr}; - - std::mutex mutex_; - bool allocated_{false}; - size_t event_count_{0}; - std::unordered_set streams_; -}; - // Note: cudaEventCreate when concurrently invoked from multiple threads can be // very expensive (at least on certain device/driver combinations). Thus, we a) // serialize event creation at a per-device level, and b) pool the events to @@ -89,81 +66,12 @@ class EventPool { std::vector pools_; }; -// Used for heterogenous lookup support in the free list. 
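The comparator removed in this hunk relies on C++14 transparent comparison so the free list can be probed by size without constructing a block. A generic, self-contained sketch of the same heterogeneous-lookup trick (Chunk, ChunkKey and BySize are illustrative names, not types from this codebase):

#include <cstddef>
#include <cstdint>
#include <set>

struct Chunk    { size_t size; void* ptr; };
struct ChunkKey { size_t size; };

// `is_transparent` lets std::set::lower_bound accept a ChunkKey directly,
// so a best-fit lookup needs no temporary Chunk object.
struct BySize {
  using is_transparent = void;
  bool operator()(const Chunk* a, const Chunk* b) const {
    return a->size != b->size ? a->size < b->size
                              : (uintptr_t)a->ptr < (uintptr_t)b->ptr;
  }
  bool operator()(const Chunk* a, ChunkKey b) const { return a->size < b.size; }
  bool operator()(ChunkKey a, const Chunk* b) const { return a.size < b->size; }
};

using FreeList = std::set<Chunk*, BySize>;
// FreeList fl;  auto it = fl.lower_bound(ChunkKey{1024});  // first chunk of size >= 1 KiB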
-struct BlockComparator { - using is_transparent = void; - bool operator()(const Block* a, const Block* b) const { - if (a->size_ != b->size_) { - return a->size_ < b->size_; - } - return (uintptr_t)a->ptr_ < (uintptr_t)b->ptr_; - } - - // Transparent overloads - bool operator()(const Block* a, BlockSize b) const { - if (a->size_ != b.size_) { - return a->size_ < b.size_; - } - return (uintptr_t)a->ptr_ < (uintptr_t)b.ptr_; - } - bool operator()(BlockSize a, const Block* b) const { - if (a.size_ != b->size_) { - return a.size_ < b->size_; - } - return (uintptr_t)a.ptr_ < (uintptr_t)b->ptr_; - } -}; - -/** - * Note [CUDAHostAllocator design] - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * We have three key data structures - the free list which stores blocks that - * are not currently used, the block list which stores all blocks that have been - * allocated, and the event queue which stores CUDA events and their - * corresponding blocks. - * - * Each of these are protected by a separate mutex. The key design principles - * are to 1) only hold each mutex for the minimal amount of time possible, 2) - * never do any possible expensive operations (such as CUDA runtime API calls) - * while holding the lock. - * - * There are three public methods: allocate, free, and record_event. In the - * allocate path, we first check to see if we can service our request from this - * free list, and otherwise we create a new block with cudaHostAlloc. In the - * free path, we insert events (if required) into the event queue, and if - * possible insert our block back into the free list. In allocate, we first - * eagerly query events until we find one that is not ready, and insert the - * corresponding block onto the free list if all the events recorded for a - * block are ready. In the record_event path, we simply insert the given - * stream into the set of streams tracked by the specified block. This set of - * streams is then consumed in the free path. - * - * Some of the invariants here are less strict than they could be - for example, - * we do not enforce that free(Block* block) => block->event_count == 0. This is - * for compatibility reasons, and we can explore enforcing these in subsequent - * versions. - */ -class CUDAHostAllocator { - public: - std::pair allocate(size_t size) { - if (size == 0) { - return {nullptr, nullptr}; - } +using Block = HostBlock; - process_events(); - - // First, try to allocate from the free list - { - std::lock_guard g(free_list_mutex_); - auto it = free_list_.lower_bound(BlockSize{size, nullptr}); - if (it != free_list_.end()) { - auto block = *it; - block->allocated_ = true; - free_list_.erase(it); - return {block->ptr_, reinterpret_cast(block)}; - } - } - // Then, create a new block. +struct CUDACachingHostAllocatorImpl + : public CachingHostAllocatorImpl { + private: + void allocate_host_memory(size_t size, void** ptr) override { // Pinned memory pointers allocated by any device can be directly used by // any other device, regardless of the current device at the time of // allocation, since we assume unified addressing. So we grab any existing @@ -176,188 +84,49 @@ class CUDAHostAllocator { at::Device(at::DeviceType::CUDA, *primary_ctx_device_index)); } - // Round up the allocation to the nearest power of two to improve reuse. 
- size_t roundSize = c10::llvm::PowerOf2Ceil(size); - void* ptr = nullptr; if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: pinned_use_cuda_host_register()) { - allocWithCudaHostRegister(&ptr, roundSize); + allocWithCudaHostRegister(ptr, size); } else { // Use cudaHostAlloc for allocating pinned memory (global lock in driver) - C10_CUDA_CHECK(cudaHostAlloc(&ptr, roundSize, cudaHostAllocDefault)); + C10_CUDA_CHECK(cudaHostAlloc(ptr, size, cudaHostAllocDefault)); } - - auto block = new Block(); - block->size_ = roundSize; - block->ptr_ = ptr; - block->allocated_ = true; - - { - std::lock_guard g(blocks_mutex_); - blocks_.insert(block); - ptr_to_block_.insert({block->ptr_, block}); - } - return {block->ptr_, reinterpret_cast(block)}; } - void free(void* ctx) { - if (!ctx) { - return; - } - - // Note: we can assume that free is correctly paired with alloc, - // and thus we do not need to look up the ctx in blocks_. - auto* block = reinterpret_cast(ctx); - - c10::optional> events; - { - std::lock_guard g(block->mutex_); - block->allocated_ = false; - if (block->streams_.empty()) { - TORCH_INTERNAL_ASSERT(block->event_count_ == 0); - } else { - events = std::vector(); - events->reserve(block->streams_.size()); - for (auto stream : block->streams_) { - auto event = event_pool_.get(stream.device_index()); - event->record(stream); - events->push_back(std::move(event)); - } - block->event_count_ += events->size(); - block->streams_.clear(); - } - } - - if (!events) { - std::lock_guard g(free_list_mutex_); - free_list_.insert(block); + void free_block(Block* block) override { + if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: + pinned_use_cuda_host_register()) { + void* ptr = block->ptr_; + AT_CUDA_CHECK(cudaHostUnregister(ptr)); + free(ptr); } else { - std::lock_guard g(cuda_events_mutex_); - for (auto&& event : *events) { - cuda_events_.emplace_front(std::move(event), block); - } + AT_CUDA_CHECK(cudaFreeHost(block->ptr_)); } } - bool record_event(void* ptr, void* ctx, at::cuda::CUDAStream stream) { - auto* block = reinterpret_cast(ctx); - - // Note: we need to check if the passed-in `ctx` is valid. This is because - // `record_event` (via `CachingHostAllocator_recordEvent`) can be invoked on - // an arbitrary tensor, and is not guaranteed to correspond to a pinned - // memory allocation. Therefore, we need to check that `ctx` is valid before - // proceeding. - { - std::lock_guard g(blocks_mutex_); - if (blocks_.find(block) != blocks_.end()) { - // Now we know this object is safe to access. - std::lock_guard gb(block->mutex_); - TORCH_INTERNAL_ASSERT(block->allocated_); - block->streams_.insert(stream); - return true; - } - auto it = ptr_to_block_.find(ptr); - if (it != ptr_to_block_.end()) { - block = it->second; - std::lock_guard g(block->mutex_); - TORCH_INTERNAL_ASSERT(block->allocated_); - block->streams_.insert(stream); - return true; - } - } - - return false; + void record_stream( + c10::optional>& events, + CUDAStream stream) override { + auto event = create_event_internal(stream.device_index()); + event->record(stream); + events->push_back(std::move(event)); } - void empty_cache() { - // Flush any available blocks into the free_list. - process_events(); - - // Release cached events from the event pool. - event_pool_.empty_cache(); - - // Remove all elements from the free list, remove them from the blocks - // list, and free the associated pinned memory allocation. 
This requires - // concurrently holding both the free list mutex and the blocks mutex, and - // is the only function that concurrently holds multiple mutexes. - std::lock(free_list_mutex_, blocks_mutex_); - std::lock_guard gf(free_list_mutex_, std::adopt_lock); - std::lock_guard gb(blocks_mutex_, std::adopt_lock); - - std::vector blocks_to_remove(free_list_.begin(), free_list_.end()); - free_list_.clear(); - for (auto* block : blocks_to_remove) { - blocks_.erase(block); - ptr_to_block_.erase(block->ptr_); - if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: - pinned_use_cuda_host_register()) { - void* ptr = block->ptr_; - AT_CUDA_CHECK(cudaHostUnregister(ptr)); - free(ptr); - } else { - AT_CUDA_CHECK(cudaFreeHost(block->ptr_)); - } - delete block; + bool query_event(EventPool::Event& event) override { + cudaError_t err = cudaEventQuery(*event); + if (err == cudaErrorNotReady) { + (void)cudaGetLastError(); // clear CUDA error + return false; + } else if (err != cudaSuccess) { + C10_CUDA_CHECK(err); } + return true; } - private: - void process_events() { - while (true) { - // Avoid calling cudaEventDestroy while holding a mutex, so move - // intermediate events out of the lock into this object. - c10::optional> processed; - - { - std::lock_guard g(cuda_events_mutex_); - if (!cuda_events_.empty()) { - processed = std::move(cuda_events_.back()); - cuda_events_.pop_back(); - } - } - - if (!processed) { - return; - } - - // otherwise, query the event - { - // now, see if we can handle this element - auto& event = processed->first; - cudaError_t err = cudaEventQuery(*event); - if (err == cudaErrorNotReady) { - (void)cudaGetLastError(); // clear CUDA error - // push the event onto the back of the queue if it's not - // ready. TODO: do we need some debouncing logic to avoid allocating - // threads repeatedly spinning on an event? - { - std::lock_guard g(cuda_events_mutex_); - cuda_events_.push_back(std::move(*processed)); - } - return; - } else if (err != cudaSuccess) { - C10_CUDA_CHECK(err); - } - } - - // Process the events. - TORCH_INTERNAL_ASSERT(processed); - auto* block = processed->second; - bool available = false; - { - std::lock_guard g(block->mutex_); - TORCH_INTERNAL_ASSERT(!block->allocated_) - block->event_count_--; - if (block->event_count_ == 0) { - available = true; - } - } - - if (available) { - std::lock_guard g(free_list_mutex_); - free_list_.insert(block); - } - } + EventPool::Event create_event_internal(DeviceIndex idx) { + // Leak the event pool to avoid shutdown issue. + static auto* event_pool = new EventPool(); + return event_pool->get(idx); } TaskThreadPool* getThreadPool() { @@ -402,7 +171,7 @@ class CUDAHostAllocator { ""); } - inline void allocWithCudaHostRegister(void** ptr, size_t roundSize) { + void allocWithCudaHostRegister(void** ptr, size_t roundSize) { // Here we do regular allocation, pre-fault/map the pages, and then do // cudaHostRegister with GPU mapping flags to lock the pages, so we // can minimize the cost for the cuda global lock. 
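A minimal sketch of the register-based pinning path this comment describes: allocate pageable memory, touch every page so it is actually mapped, then pin it with cudaHostRegister (flags simplified to cudaHostRegisterDefault here; the patch parallelizes the pre-fault step across a thread pool):

#include <cuda_runtime.h>
#include <cstddef>
#include <cstdlib>

void* alloc_and_register(size_t size, size_t page_size = 4096) {
  void* ptr = std::malloc(size);
  if (ptr == nullptr) return nullptr;
  // Pre-fault: write one byte per page so the pages exist before registration.
  for (size_t off = 0; off < size; off += page_size) {
    static_cast<char*>(ptr)[off] = 0;
  }
  if (cudaHostRegister(ptr, size, cudaHostRegisterDefault) != cudaSuccess) {
    std::free(ptr);
    return nullptr;
  }
  return ptr;  // release later with cudaHostUnregister(ptr) followed by std::free(ptr)
}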
@@ -423,13 +192,19 @@ class CUDAHostAllocator { for (size_t i = 0; i < numMapThreads; i++) { promises.emplace_back(); futures.push_back(promises[i].get_future()); - auto task = [this, i, ptr, roundSize, numMapThreads, pageSize, &promises]() mutable { + auto task = [this, + i, + ptr, + roundSize, + numMapThreads, + pageSize, + &promises]() mutable { mapPagesForRegister( - *ptr, - roundSize, - i, // thread task-id - numMapThreads, - pageSize); + *ptr, + roundSize, + i, // thread task-id + numMapThreads, + pageSize); // set the promise when mapping pages are done promises[i].set_value(); }; @@ -446,62 +221,48 @@ class CUDAHostAllocator { // Register the mapped pages using cudaHostRegister registerPages(*ptr, roundSize); } +}; - EventPool event_pool_; - - alignas(64) std::mutex blocks_mutex_; - std::unordered_set blocks_; - std::unordered_map ptr_to_block_; - // Note: sharding this mutex seems to be profitable in heavily multi-threaded - // scenarios. - alignas(64) std::mutex free_list_mutex_; - // Note: an alternative datastructure can yield significant wins here in - // microbenchmarks. - std::set free_list_; +void raw_local_deleter(void* ptr); - alignas(64) std::mutex cuda_events_mutex_; - std::deque> cuda_events_; +struct CUDACachingHostAllocator final + : public CachingHostAllocatorInterface { + at::DataPtr allocate(size_t size) override { + auto ptr_and_ctx = impl_->allocate(size); + return { + ptr_and_ctx.first, + ptr_and_ctx.second, + &raw_local_deleter, + at::DeviceType::CPU}; + } }; -} // namespace +CUDACachingHostAllocator caching_host_allocator; -static CUDAHostAllocator& getCUDAHostAllocator() { - // leak and don't worry about shutdown - static auto* r = new CUDAHostAllocator(); - return *r; +static inline CUDACachingHostAllocator& getCUDACachingHostAllocator() { + return caching_host_allocator; } -static void CUDAHostAllocatorDeleter(void* ctx) { - getCUDAHostAllocator().free(ctx); +void raw_local_deleter(void* ptr) { + getCUDACachingHostAllocator().free(ptr); } +} // anonymous namespace + bool CachingHostAllocator_recordEvent( void* ptr, void* ctx, at::cuda::CUDAStream stream) { - return getCUDAHostAllocator().record_event(ptr, ctx, stream); + return getCUDACachingHostAllocator().record_event(ptr, ctx, stream); } // Releases cached pinned memory allocations via cudaHostFree void CachingHostAllocator_emptyCache() { - getCUDAHostAllocator().empty_cache(); + getCUDACachingHostAllocator().empty_cache(); } -struct CUDAHostAllocatorWrapper final : public at::Allocator { - at::DataPtr allocate(size_t size) const override { - auto ptr_and_ctx = getCUDAHostAllocator().allocate(size); - return { - ptr_and_ctx.first, - ptr_and_ctx.second, - &CUDAHostAllocatorDeleter, - at::DeviceType::CPU}; - } -}; - -static CUDAHostAllocatorWrapper cuda_host_allocator; - at::Allocator* getCachingHostAllocator() { - return &cuda_host_allocator; + return &getCUDACachingHostAllocator(); } } // namespace at::cuda diff --git a/aten/src/ATen/cuda/CachingHostAllocator.h b/aten/src/ATen/cuda/CachingHostAllocator.h index 65ad7f7d16e24..a7209582b2ba1 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.h +++ b/aten/src/ATen/cuda/CachingHostAllocator.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -17,15 +18,14 @@ namespace at::cuda { // call between host and device, and passed the corresponding context from the // allocation. This is currently invoked by at::native::copy_kernel_cuda. 
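A hedged sketch of the consumer-side contract this header describes, roughly what at::native::copy_kernel_cuda does: take a pinned buffer from the caching host allocator, launch the async copy, then record the stream so the block is not recycled before the copy finishes:

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CachingHostAllocator.h>
#include <c10/cuda/CUDAException.h>
#include <cuda_runtime.h>

void d2h_into_pinned(const at::Tensor& gpu_src) {
  // Pinned host buffer handed out by the caching host allocator.
  at::DataPtr host = at::cuda::getCachingHostAllocator()->allocate(gpu_src.nbytes());

  auto stream = at::cuda::getCurrentCUDAStream();
  C10_CUDA_CHECK(cudaMemcpyAsync(host.get(), gpu_src.const_data_ptr(),
                                 gpu_src.nbytes(), cudaMemcpyDeviceToHost, stream));

  // Mark the block as in use on `stream`; it returns to the free list only
  // after the recorded event has completed.
  at::cuda::CachingHostAllocator_recordEvent(host.get(), host.get_context(), stream);
}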
// -// Note that this allocator does not split larger allocations into smaller -// blocks, unlike the caching device allocator. -// TORCH_CUDA_CPP_API c10::Allocator* getCachingHostAllocator(); // Records an event in the specified stream. The allocation corresponding to the // input `ptr`/`ctx` will not be re-used until the event has occurred. -TORCH_CUDA_CPP_API bool -CachingHostAllocator_recordEvent(void* ptr, void* ctx, c10::cuda::CUDAStream stream); +TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent( + void* ptr, + void* ctx, + c10::cuda::CUDAStream stream); // Releases cached pinned memory allocations via cudaHostFree TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache(); diff --git a/aten/src/ATen/cuda/CuSparseHandlePool.cpp b/aten/src/ATen/cuda/CuSparseHandlePool.cpp index 1a57044138ab2..58ba5019dff50 100644 --- a/aten/src/ATen/cuda/CuSparseHandlePool.cpp +++ b/aten/src/ATen/cuda/CuSparseHandlePool.cpp @@ -26,7 +26,7 @@ using CuSparsePoolType = DeviceThreadHandlePool +#include #include #include @@ -76,7 +77,7 @@ using CuBlasPoolType = DeviceThreadHandlePoolreserve(device); auto stream = c10::cuda::getCurrentCUDAStream(); TORCH_CUDABLAS_CHECK(cublasSetStream(handle, stream)); -#if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION < 12200 - // cuBLAS should not need an explicitly allocated workspace after CUDA 12.2 - // to avoid increasing memory usage during graph captures +#if !defined(USE_ROCM) + // We explicitly set the cublas workspace even though CUDA 12.2+ fixed the + // issue where memory usage increased during graph capture. // original issue: https://github.com/pytorch/pytorch/pull/83461 + // This is because in CUDA 12.2+, the use of cudaMallocAsync in cublas + // will allocate memory dynamically (even if they're cheap) outside + // PyTorch's CUDA caching allocator. It's possible that CCA used up + // all the memory and cublas's cudaMallocAsync will return OOM cudaStream_t _stream = stream; auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); auto workspace_it = cublas_handle_stream_to_workspace().find(key); @@ -154,8 +171,6 @@ cublasHandle_t getCurrentCUDABlasHandle() { workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()}); } TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize())); -#endif -#if !defined(USE_ROCM) // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup // FP32 data type calculations based on the value of the allow_tf32 flag. // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH. 
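For reference, a small sketch of how the TF32 decision above is driven from the global context (setAllowTF32CuBLAS and getCurrentCUDABlasHandle are existing ATen APIs; exact headers may differ across versions):

#include <ATen/Context.h>
#include <ATen/cuda/CUDAContext.h>
#include <cublas_v2.h>

// Toggle TF32 for FP32 cuBLAS GEMMs; the math mode is (re)applied every time
// the per-thread handle is fetched, based on the global flag.
void set_tf32_for_matmul(bool allow) {
  at::globalContext().setAllowTF32CuBLAS(allow);
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  (void)handle;  // subsequent ATen matmuls on this thread/stream use this handle
}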
@@ -164,8 +179,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { } else { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); } -#endif -#if defined(USE_ROCM) +#else hipblasAtomicsMode_t hipblas_mode; if (at::globalContext().deterministicAlgorithms()) { hipblas_mode = HIPBLAS_ATOMICS_NOT_ALLOWED; @@ -177,10 +191,10 @@ cublasHandle_t getCurrentCUDABlasHandle() { return handle; } -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) cublasLtHandle_t getCurrentCUDABlasLtHandle() { #ifdef USE_ROCM - int device; + c10::DeviceIndex device = 0; AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); // Thread local PoolWindows are lazily-initialized diff --git a/aten/src/ATen/cuda/Exceptions.h b/aten/src/ATen/cuda/Exceptions.h index a15f0d7947ec2..c647bc2531b4b 100644 --- a/aten/src/ATen/cuda/Exceptions.h +++ b/aten/src/ATen/cuda/Exceptions.h @@ -21,6 +21,15 @@ class CuDNNError : public c10::Error { } // namespace c10 +#define AT_CUDNN_FRONTEND_CHECK(EXPR, ...) \ + do { \ + auto error_object = EXPR; \ + if (!error_object.is_good()) { \ + TORCH_CHECK_WITH(CuDNNError, false, \ + "cuDNN Frontend error: ", error_object.get_message()); \ + } \ + } while (0) \ + #define AT_CUDNN_CHECK_WITH_SHAPES(EXPR, ...) AT_CUDNN_CHECK(EXPR, "\n", ##__VA_ARGS__) // See Note [CHECK macro] diff --git a/aten/src/ATen/cuda/cub-RadixSortKeys.cu b/aten/src/ATen/cuda/cub-RadixSortKeys.cu index cf88c8aa0cc89..74e82ae55cdee 100644 --- a/aten/src/ATen/cuda/cub-RadixSortKeys.cu +++ b/aten/src/ATen/cuda/cub-RadixSortKeys.cu @@ -51,5 +51,8 @@ void radix_sort_keys( int64_t end_bit); AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTATIATE_CUB_TEMPLATES) +AT_INSTATIATE_CUB_TEMPLATES(uint16_t, UInt16) +AT_INSTATIATE_CUB_TEMPLATES(uint32_t, UInt32) +AT_INSTATIATE_CUB_TEMPLATES(uint64_t, UInt64) } // namespace at::cuda::cub diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs.cu b/aten/src/ATen/cuda/cub-RadixSortPairs.cu index bd20069cf6ad0..cc7c969300104 100644 --- a/aten/src/ATen/cuda/cub-RadixSortPairs.cu +++ b/aten/src/ATen/cuda/cub-RadixSortPairs.cu @@ -77,6 +77,9 @@ AT_INSTANTIATE_SORT_PAIRS(int64_t, 4) AT_INSTANTIATE_SORT_PAIRS(scalar_t, 8) AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTANTIATE_SORT_PAIRS_8) +AT_INSTANTIATE_SORT_PAIRS(uint16_t, 8) +AT_INSTANTIATE_SORT_PAIRS(uint32_t, 8) +AT_INSTANTIATE_SORT_PAIRS(uint64_t, 8) // BFloat16 Radix sort is supported from ROCm 4.5 onwards #if !AT_ROCM_ENABLED() || (AT_ROCM_ENABLED() && ROCM_VERSION >= 40500) diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 9663f354f764c..062c365a4e1a9 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -6,8 +6,6 @@ #include #include -#include - #include #if USE_GLOBAL_CUB_WRAPPED_NAMESPACE() diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 24c29a6381544..d3b80af2e8599 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -137,7 +137,7 @@ bool CUDAHooks::isPinnedPtr(const void* data) const { cudaPointerAttributes attr; // We do not believe that CUDA needs mutable access to the data // here. 
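A condensed, self-contained sketch of the probe isPinnedPtr performs; the change here passes the pointer straight through because cudaPointerGetAttributes accepts a const pointer:

#include <cuda_runtime.h>

bool is_pinned_ptr(const void* data) {
  cudaPointerAttributes attr{};
  cudaError_t err = cudaPointerGetAttributes(&attr, data);
  if (err == cudaErrorInvalidValue) {
    (void)cudaGetLastError();  // clear the sticky error (unregistered host pointers can report this)
    return false;
  }
  return err == cudaSuccess && attr.type == cudaMemoryTypeHost;
}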
- cudaError_t err = cudaPointerGetAttributes(&attr, const_cast(data)); + cudaError_t err = cudaPointerGetAttributes(&attr, data); #if !defined(USE_ROCM) if (err == cudaErrorInvalidValue) { (void)cudaGetLastError(); // clear CUDA error @@ -184,6 +184,16 @@ bool CUDAHooks::hasCuSOLVER() const { #endif } +bool CUDAHooks::hasCuBLASLt() const { +#if defined(CUDART_VERSION) + return true; +#elif AT_ROCM_ENABLED() && defined(ROCM_VERSION) && ROCM_VERSION >= 50700 + return true; +#else + return false; +#endif +} + bool CUDAHooks::hasROCM() const { // Currently, this is same as `compiledWithMIOpen`. // But in future if there are ROCm builds without MIOpen, @@ -227,7 +237,7 @@ const at::cuda::NVRTC& CUDAHooks::nvrtc() const { } DeviceIndex current_device() { - int device; + c10::DeviceIndex device = 0; cudaError_t err = c10::cuda::GetDevice(&device); if (err == cudaSuccess) { return device; diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index dddeab1e2675f..2002bd1b77402 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -27,6 +27,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasMAGMA() const override; bool hasCuDNN() const override; bool hasCuSOLVER() const override; + bool hasCuBLASLt() const override; bool hasROCM() const override; const at::cuda::NVRTC& nvrtc() const override; DeviceIndex current_device() const override; diff --git a/aten/src/ATen/cuda/detail/IndexUtils.cuh b/aten/src/ATen/cuda/detail/IndexUtils.cuh index 1eceaf690f5a1..db8519389e9ff 100644 --- a/aten/src/ATen/cuda/detail/IndexUtils.cuh +++ b/aten/src/ATen/cuda/detail/IndexUtils.cuh @@ -21,8 +21,16 @@ getTensorInfo(const at::TensorBase &t) { st[i] = t.stride(i); } + scalar* data_ptr = nullptr; + + if constexpr (std::is_const::value) { + data_ptr = t.const_data_ptr(); + } else { + data_ptr = t.mutable_data_ptr(); + } + return TensorInfo( - t.data_ptr(), dims, sz, st); + data_ptr, dims, sz, st); } } // namespace at::cuda::detail diff --git a/aten/src/ATen/cuda/jiterator.cu b/aten/src/ATen/cuda/jiterator.cu index 0a4ac757b1ada..db751e33c43d2 100644 --- a/aten/src/ATen/cuda/jiterator.cu +++ b/aten/src/ATen/cuda/jiterator.cu @@ -339,7 +339,7 @@ c10::SmallVector CompileAndLaunchKernel( config.add_owned_output(outs[i]); } for (const auto& t: tensors) { - config.add_input(t); + config.add_const_input(t); } TensorIterator iter = config.build(); diff --git a/aten/src/ATen/cuda/tunable/GemmCommon.h b/aten/src/ATen/cuda/tunable/GemmCommon.h new file mode 100644 index 0000000000000..a1d7d0dc21638 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/GemmCommon.h @@ -0,0 +1,217 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. 
+// +#pragma once + +#include + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + +namespace at::cuda::tunable { + +enum class BlasOp { + N = 0, + T = 1 +}; + +inline std::string BlasOpToString(BlasOp op) { + switch (op) { + case BlasOp::N: + return "N"; + case BlasOp::T: + return "T"; + } + TORCH_CHECK(false, "unrecognized BlasOp"); + return "N"; +} + +namespace detail { + +static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size) { + auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA); + // comparison done as 1D tensor + at::Tensor ref = at::from_blob(c, {size}, options); + at::Tensor oth = at::from_blob(other_c, {size}, options); + at::Tensor ref_float = ref.to(at::kFloat); + at::Tensor oth_float = oth.to(at::kFloat); + std::vector atols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; + std::vector rtols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; + double last_succeed_atol = 1; + double last_succeed_rtol = 1; + for (auto& atol : atols) { + for (auto& rtol : rtols) { + if (at::allclose(ref_float, oth_float, rtol, atol)) { + last_succeed_atol = atol; + last_succeed_rtol = rtol; + } + } + } + if (last_succeed_atol == 1) { + return false; + } + else { + TUNABLE_LOG("ā”œā”€ā”€verify numerics: atol=", last_succeed_atol, ", rtol=", last_succeed_rtol); + } + + return true; +} + +} + +template +struct GemmParams : OpParams { + std::string Signature() const override { + return c10::str(transa, transb, "_", m, "_", n, "_", k); + } + + GemmParams* DeepCopy() const { + GemmParams* copy = new GemmParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = m * n * sizeof(T); + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + } + + TuningStatus NumericalCheck(GemmParams *other) { + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, m*n) ? OK : FAIL; + } + + char transa; + char transb; + int64_t m; + int64_t n; + int64_t k; + at::opmath_type alpha; + const T* a; + int64_t lda; + const T* b; + int64_t ldb; + at::opmath_type beta; + T* c; + int64_t ldc; +}; + +template +struct GemmStridedBatchedParams : OpParams { + std::string Signature() const override { + return c10::str(transa, transb, "_", m, "_", n, "_", k, "_B_", batch); + } + + GemmStridedBatchedParams* DeepCopy() const { + GemmStridedBatchedParams* copy = new GemmStridedBatchedParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = batch * stride_c * sizeof(T); + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + } + + TuningStatus NumericalCheck(GemmStridedBatchedParams *other) { + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, batch*stride_c) ? 
OK : FAIL; + } + + char transa; + char transb; + int64_t m; + int64_t n; + int64_t k; + at::opmath_type alpha; + const T* a; + int64_t lda; + int64_t stride_a; + const T* b; + int64_t ldb; + int64_t stride_b; + at::opmath_type beta; + T* c; + int64_t ldc; + int64_t stride_c; + int64_t batch; +}; + +template +struct ScaledGemmParams : OpParams { + std::string Signature() const override { + return c10::str(transa, transb, "_", m, "_", n, "_", k); + } + + ScaledGemmParams* DeepCopy() const { + ScaledGemmParams* copy = new ScaledGemmParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = m * n * sizeof(T); + copy->c = c10::cuda::CUDACachingAllocator::raw_alloc(c_size); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + } + + TuningStatus NumericalCheck(ScaledGemmParams *other) { + return detail::NumericalCheck(c_dtype, c, other->c, m*n) ? OK : FAIL; + } + + char transa; + char transb; + int64_t m; + int64_t n; + int64_t k; + const void* a; + const void* a_scale_ptr; + int64_t lda; + ScalarType a_dtype; + const void* b; + const void* b_scale_ptr; + int64_t ldb; + ScalarType b_dtype; + const void* bias_ptr; + ScalarType bias_dtype; + void* c; + const void* c_scale_ptr; + int64_t ldc; + ScalarType c_dtype; + void* amax_ptr; + bool use_fast_accum; +}; + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h new file mode 100644 index 0000000000000..da1483aee72c0 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -0,0 +1,591 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define TORCH_HIPBLASLT_CHECK(EXPR) \ + do { \ + hipblasStatus_t __err = EXPR; \ + TORCH_CHECK(__err == HIPBLAS_STATUS_SUCCESS, \ + "hipblaslt error: ", \ + hipblasStatusToString(__err), \ + " when calling `" #EXPR "`"); \ + } while (0) + +namespace at::cuda::tunable { + +#ifdef HIPBLASLT_HAS_GETINDEXFROMALGO +#define GETINDEXFROMALGO(algo) hipblaslt_ext::getIndexFromAlgo(algo) +#else +static int getIndexFromAlgo(hipblasLtMatmulAlgo_t& algo) { + int* algo_ptr = (int*)algo.data; + if(*algo_ptr < 0) { + return -1; + } + return *algo_ptr; +} +#define GETINDEXFROMALGO(algo) getIndexFromAlgo(algo) +#endif + +#ifdef HIPBLASLT_CUSTOM_COMPUTE_TYPE +#define COMPUTE_TYPE_32 HIPBLASLT_COMPUTE_F32 +#else +#define COMPUTE_TYPE_32 HIPBLAS_COMPUTE_32F +#endif + +#ifdef HIPBLASLT_CUSTOM_DATA_TYPE + +template +constexpr hipblasltDatatype_t HipBlasDataTypeFor(); + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_32F; +} + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_16F; +} + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_16B; +} + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_64F; +} + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_8F_E4M3; +} + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_8F_E5M3; +} + +#define DATA_TYPE_R_32 HIPBLASLT_R_32F + +#else + +template +constexpr hipblasDatatype_t HipBlasDataTypeFor(); + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIPBLAS_R_32F; +} + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIPBLAS_R_16F; +} + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIPBLAS_R_16B; +} + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIPBLAS_R_64F; +} + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIP_R_8F_E4M3_FNUZ; +} + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIP_R_8F_E5M2_FNUZ; +} + +#ifdef HIPBLAS_V2 +#define DATA_TYPE_R_32 HIP_R_32F +#else +#define DATA_TYPE_R_32 HIPBLAS_R_32F +#endif + +#endif + +template +int GetBatchFromParams(const GemmParams* params) { + return 1; +} + +template +int GetBatchFromParams(const GemmStridedBatchedParams* params) { + return params->batch; +} + +template +int GetBatchFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +int GetStrideAFromParams(const GemmParams* params) { + return 1; +} + +template +int GetStrideAFromParams(const GemmStridedBatchedParams* params) { + return params->stride_a; +} + +template +int GetStrideAFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +int GetStrideBFromParams(const GemmParams* params) { + return 1; +} + +template +int GetStrideBFromParams(const GemmStridedBatchedParams* params) { + return params->stride_b; +} + +template +int GetStrideBFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +int GetStrideCFromParams(const GemmParams* params) { + return 1; +} + +template +int GetStrideCFromParams(const GemmStridedBatchedParams* params) { + return params->stride_c; +} + +template +int GetStrideCFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +float GetAlphaFromParams(const GemmParams* params) { + return params->alpha; 
+} + +template +float GetAlphaFromParams(const GemmStridedBatchedParams* params) { + return params->alpha; +} + +template +float GetAlphaFromParams(const ScaledGemmParams* params) { + return 1.0; +} + +template +float GetBetaFromParams(const GemmParams* params) { + return params->beta; +} + +template +float GetBetaFromParams(const GemmStridedBatchedParams* params) { + return params->beta; +} + +template +float GetBetaFromParams(const ScaledGemmParams* params) { + return 0.0; +} + +template +const void* GetAScalePointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetAScalePointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetAScalePointerFromParams(const ScaledGemmParams* params) { + return params->a_scale_ptr; +} + +template +const void* GetBScalePointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetBScalePointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetBScalePointerFromParams(const ScaledGemmParams* params) { + return params->b_scale_ptr; +} + +template +const void* GetDScalePointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetDScalePointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetDScalePointerFromParams(const ScaledGemmParams* params) { + return params->c_scale_ptr; +} + +template +const void* GetBiasPointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetBiasPointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetBiasPointerFromParams(const ScaledGemmParams* params) { + return params->bias_ptr; +} + +template +hipDataType GetBiasTypeFromParams(const GemmParams* params) { + return HIP_R_32F; +} + +template +hipDataType GetBiasTypeFromParams(const GemmStridedBatchedParams* params) { + return HIP_R_32F; +} + +template +hipDataType GetBiasTypeFromParams(const ScaledGemmParams* params) { + return at::cuda::ScalarTypeToCudaDataType(params->bias_dtype); +} + +static hipblasOperation_t _hipblasOpFromChar(char op) { + switch (op) { + case 'n': + case 'N': + return HIPBLAS_OP_N; + case 't': + case 'T': + return HIPBLAS_OP_T; + case 'c': + case 'C': + return HIPBLAS_OP_C; + } + AT_ERROR( + "_hipblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); +} + +static char _charFromhipblasOp(hipblasOperation_t op) { + switch (op) { + case HIPBLAS_OP_N: + return 'N'; + case HIPBLAS_OP_T: + return 'T'; + case HIPBLAS_OP_C: + return 'C'; + } + AT_ERROR( + "_charFromhipblasOp input should be HIPBLAS_OP_N/T/C but got `", op, "`"); +} + +static hipblasOperation_t MapLayoutToHipBlasLt(BlasOp layout) { + if (layout == BlasOp::N) { + return HIPBLAS_OP_N; + } + return HIPBLAS_OP_T; +} + +static size_t GetHipblasltWorkspaceSize() { + static const char * env = getenv("HIPBLASLT_WORKSPACE_SIZE"); + // 256MB is max workspace size allowed for hipblaslt + // hipblaslt-bench uses 32MB + // recommendation from hipblaslt author was 76MB + size_t workspace_size = 2*128*1024*1024; // default 256MB + if (env) { + try { + workspace_size = std::stoi(env); + } catch(std::invalid_argument const& e) { + TORCH_WARN("invalid HIPBLASLT_WORKSPACE_SIZE,", + " using default workspace size of ", workspace_size, " bytes."); + } catch(std::out_of_range const& e) { + TORCH_WARN("HIPBLASLT_WORKSPACE_SIZE out of range,", + " using default 
workspace size of ", workspace_size, " bytes."); + } + } + return workspace_size; +} + +template +struct HipBlasLtDeleter { + void operator()(T* x) { + if (x != nullptr) { + TORCH_CUDABLAS_CHECK(destructor(x)); + } + } +}; + +template +class HipBlasLtDescriptor { + public: + T* descriptor() const { + return descriptor_.get(); + } + T* descriptor() { + return descriptor_.get(); + } + + protected: + std::unique_ptr> descriptor_; +}; + +class HipBlasLtMatmulDescriptor : public HipBlasLtDescriptor< + hipblasLtMatmulDescOpaque_t, + &hipblasLtMatmulDescDestroy> { + public: + HipBlasLtMatmulDescriptor( + hipblasComputeType_t compute_type, + hipDataType scale_type) { + hipblasLtMatmulDesc_t raw_descriptor = nullptr; + TORCH_HIPBLASLT_CHECK( + hipblasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(hipblasLtMatmulDescAttributes_t attr, const T value) { + TORCH_HIPBLASLT_CHECK(::hipblasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T))); + } +}; + +template +class HipblasltGemmOp : public Callable { + public: + HipblasltGemmOp(hipblasLtMatmulAlgo_t algo) : algo_{algo} {} + + TuningStatus Call(const ParamsT* params) override { + hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout); + hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout); + auto a_datatype = HipBlasDataTypeFor(); + auto b_datatype = HipBlasDataTypeFor(); + auto in_out_datatype = HipBlasDataTypeFor(); + auto opa = _hipblasOpFromChar(params->transa); + auto opb = _hipblasOpFromChar(params->transb); + + TORCH_CHECK(transa_outer == opa && transb_outer == opb, "trans mismatch, shouldn't happen"); + + float alpha = GetAlphaFromParams(params); + float beta = GetBetaFromParams(params); + + hipblasLtMatrixLayout_t mat_a, mat_b, mat_c; + if (opa == HIPBLAS_OP_N) { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->m, params->k, params->lda)); + } + else { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->k, params->m, params->lda)); + } + if (opb == HIPBLAS_OP_N) { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->k, params->n, params->ldb)); + } + else { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->n, params->k, params->ldb)); + } + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_c, in_out_datatype, params->m, params->n, params->ldc)); + + // specific to batched gemmm + int batch = GetBatchFromParams(params); + if (batch > 1) { + int64_t stride_a = GetStrideAFromParams(params); + int64_t stride_b = GetStrideBFromParams(params); + int64_t stride_c = GetStrideCFromParams(params); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_a, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_a, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_a, sizeof(stride_a))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_b, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_b, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_b, sizeof(stride_b))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_c, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c))); + 
} + + HipBlasLtMatmulDescriptor matmul(COMPUTE_TYPE_32, DATA_TYPE_R_32); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSA, opa); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSB, opb); + + // specific to scaled gemm + const void* mat1_scale_ptr = GetAScalePointerFromParams(params); + const void* mat2_scale_ptr = GetBScalePointerFromParams(params); + const void* result_scale_ptr = GetDScalePointerFromParams(params); + if (mat1_scale_ptr && mat2_scale_ptr && result_scale_ptr) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr); + + const void* bias_ptr = GetBiasPointerFromParams(params); + auto bias_datatype = GetBiasTypeFromParams(params); + if (bias_ptr) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_BIAS); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, bias_datatype); + } + } + + size_t workspace_size = GetHipblasltWorkspaceSize(); + + auto op_handle = at::cuda::getCurrentCUDABlasLtHandle(); + + size_t ret_workspace_size = 0; + auto status = hipblaslt_ext::matmulIsAlgoSupported(op_handle, + matmul.descriptor(), + &alpha, + mat_a, + mat_b, + &beta, + mat_c, + mat_c, + algo_, + ret_workspace_size); + + if (status == HIPBLAS_STATUS_SUCCESS) { + if (ret_workspace_size >= workspace_size) { + //TUNABLE_LOG("[hipBLASLt] Solution #", algo_index, " workspace too large"); + return FAIL; + } + } + else { + //TUNABLE_LOG("[hipBLASLt] Solution #", algo_index, " not supported"); + return FAIL; + } + + void* workspace_buffer = nullptr; + if (workspace_size > 0) { + workspace_buffer = c10::cuda::CUDACachingAllocator::raw_alloc(workspace_size); + } + + TORCH_HIPBLASLT_CHECK(hipblasLtMatmul(op_handle, + matmul.descriptor(), + &alpha, + params->a, + mat_a, + params->b, + mat_b, + &beta, + params->c, + mat_c, + params->c, + mat_c, + &algo_, + workspace_buffer, + workspace_size, + at::cuda::getCurrentCUDAStream())); + + //TORCH_HIPBLASLT_CHECK(hipblasLtMatmulDescDestroy(matmul)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_a)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_b)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_c)); + if (workspace_size > 0) { + c10::cuda::CUDACachingAllocator::raw_delete(workspace_buffer); + } + return OK; + } + + private: + hipblasLtMatmulAlgo_t algo_; +}; + +template +auto GetHipBlasLtTypeStringAndOps() { + hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout); + hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout); + auto a_datatype = HipBlasDataTypeFor(); + auto b_datatype = HipBlasDataTypeFor(); + auto in_out_datatype = HipBlasDataTypeFor(); + std::vector heuristic_result; + + hipblasLtHandle_t handle; + TORCH_HIPBLASLT_CHECK(hipblasLtCreate(&handle)); + TORCH_HIPBLASLT_CHECK(hipblaslt_ext::getAllAlgos(handle, + hipblaslt_ext::GemmType::HIPBLASLT_GEMM, + transa_outer, + transb_outer, + a_datatype, + b_datatype, + in_out_datatype, + in_out_datatype, + COMPUTE_TYPE_32, + heuristic_result)); + TORCH_HIPBLASLT_CHECK(hipblasLtDestroy(handle)); + + // Sort heuristic_result by algo index to make sure the order of returned algos is deterministic. 
+ std::sort(heuristic_result.begin(), + heuristic_result.end(), + [](hipblasLtMatmulHeuristicResult_t& a, hipblasLtMatmulHeuristicResult_t& b) { + return GETINDEXFROMALGO(a.algo) < GETINDEXFROMALGO(b.algo); + }); + + int returned_algo_count = heuristic_result.size(); + std::vector>>> ret; + for (int i = 0; i < returned_algo_count; i++) { + auto algo = heuristic_result[i].algo; + int algo_index = GETINDEXFROMALGO(algo); + auto callable = std::make_unique>(algo); + std::string type_string = c10::str( + "Gemm_Hipblaslt_", _charFromhipblasOp(transa_outer), _charFromhipblasOp(transb_outer), "_", algo_index); + ret.emplace_back(type_string, std::move(callable)); + } + + return ret; +} + +template +auto GetHipBlasLtGemmTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +template +auto GetHipBlasLtGemmStridedBatchedTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +template +auto GetHipBlasLtScaledGemmTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +#undef TORCH_HIPBLASLT_CHECK +#undef GETINDEXFROMALGO +#undef COMPUTE_TYPE_32 +#undef DATA_TYPE_R_32 + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/GemmRocblas.h b/aten/src/ATen/cuda/tunable/GemmRocblas.h new file mode 100644 index 0000000000000..f096ff00fd9b4 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/GemmRocblas.h @@ -0,0 +1,275 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include + +#define ROCBLAS_BETA_FEATURES_API +#include + +#define TORCH_ROCBLAS_CHECK(EXPR) \ + do { \ + rocblas_status __err = EXPR; \ + TORCH_CHECK(__err == rocblas_status_success, \ + "rocblas error: ", \ + rocblas_status_to_string(__err), \ + " when calling `" #EXPR "`"); \ + } while (0) + +namespace at::cuda::tunable { + +template +constexpr rocblas_datatype RocBlasDataTypeFor(); + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_f64_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_f16_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_bf16_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor>() { + return rocblas_datatype_f32_c; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor>() { + return rocblas_datatype_f64_c; +} + +template +constexpr rocblas_datatype RocBlasComputeTypeFor(); + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + return rocblas_datatype_f64_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + // Note that we're returning the _compute_ type for a given datatype. + // As of 12/2022, using compute type FP16 for 16-bit floats was much + // slower than using compute type FP32. So we use FP32 compute even for + // FP16 datatypes. This is how GEMM is implemented even in the function + // rocblasGemmHelper (see fpgeneric.h) + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + // Note that we're returning the _compute_ type for a given datatype. + // As of 12/2022, using compute type FP16 for 16-bit floats was much + // slower than using compute type FP32. So we use FP32 compute even for + // BF16 datatypes. 
This is how GEMM is implemented even in the function + // rocblasGemmHelper (see fpgeneric.h) + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor>() { + return rocblas_datatype_f32_c; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor>() { + return rocblas_datatype_f64_c; +} + +template +auto DoCastForHalfOrBfloat16(const T fp) { + return fp; +} + +template <> +inline auto DoCastForHalfOrBfloat16(const Half fp) { + // alpha and beta should be the same as compute_type, in Half case it is float. + float h = fp; + return h; +} + +template <> +inline auto DoCastForHalfOrBfloat16(const BFloat16 fp) { + // alpha and beta should be the same as compute_type, in bfloat16 case it is float. + float h = fp; + return h; +} + +static rocblas_operation _rocblasOpFromChar(char op) { + switch (op) { + case 'n': + case 'N': + return rocblas_operation_none; + case 't': + case 'T': + return rocblas_operation_transpose; + case 'c': + case 'C': + return rocblas_operation_conjugate_transpose; + } + AT_ERROR( + "_rocblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); +} + +template +class RocblasGemmOp : public Callable> { + public: + RocblasGemmOp(int solution) : solution_{solution} {} + + TuningStatus Call(const GemmParams* params) override { + auto input_output_type = RocBlasDataTypeFor(); + auto compute_type = RocBlasComputeTypeFor(); + auto h_a = DoCastForHalfOrBfloat16(params->alpha); + auto h_b = DoCastForHalfOrBfloat16(params->beta); + auto status = rocblas_gemm_ex( + (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(), + _rocblasOpFromChar(params->transa), + _rocblasOpFromChar(params->transb), + params->m, params->n, params->k, + &h_a, + params->a, input_output_type, params->lda, + params->b, input_output_type, params->ldb, + &h_b, + params->c, input_output_type, params->ldc, + params->c, input_output_type, params->ldc, + compute_type, + rocblas_gemm_algo_solution_index, + solution_, + rocblas_gemm_flags_none); + if (status != rocblas_status_success) { + return FAIL; + } + return OK; + } + + private: + int solution_; +}; + +template +auto GetRocBlasGemmTypeStringAndOps() { + rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(); + int solution_size; + auto input_output_type = RocBlasDataTypeFor(); + auto compute_type = RocBlasComputeTypeFor(); + // Get the number of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + nullptr, + &solution_size)); + std::vector solutions(solution_size); + // Get the list of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + solutions.data(), + &solution_size)); + // Sort the solutions in ascending order to make the solution vector deterministic across runs + std::sort(solutions.begin(), solutions.end()); + + std::vector>>>> ret; + for (size_t i = 0; i < solutions.size(); ++i) { + auto callable = std::make_unique>(solutions[i]); + ret.emplace_back(std::make_pair(c10::str("Gemm_Rocblas_", solutions[i]), std::move(callable))); + } + return ret; +} + +template +class RocblasGemmStridedBatchedOp : public Callable> { + public: + RocblasGemmStridedBatchedOp(int solution) : solution_{solution} {} + + TuningStatus Call(const GemmStridedBatchedParams* params) override { + auto input_output_type = RocBlasDataTypeFor(); + auto 
compute_type = RocBlasComputeTypeFor(); + auto h_a = DoCastForHalfOrBfloat16(params->alpha); + auto h_b = DoCastForHalfOrBfloat16(params->beta); + auto status = rocblas_gemm_strided_batched_ex( + (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(), + _rocblasOpFromChar(params->transa), + _rocblasOpFromChar(params->transb), + params->m, params->n, params->k, + &h_a, + params->a, input_output_type, params->lda, params->stride_a, + params->b, input_output_type, params->ldb, params->stride_b, + &h_b, + params->c, input_output_type, params->ldc, params->stride_c, + params->c, input_output_type, params->ldc, params->stride_c, + params->batch, + compute_type, + rocblas_gemm_algo_solution_index, + solution_, + rocblas_gemm_flags_none); + if (status != rocblas_status_success) { + return FAIL; + } + return OK; + } + + private: + int solution_; +}; + +template +auto GetRocBlasGemmStridedBatchedTypeStringAndOps() { + rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(); + int solution_size; + auto input_output_type = RocBlasDataTypeFor(); + auto compute_type = RocBlasComputeTypeFor(); + // Get the number of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + nullptr, + &solution_size)); + std::vector solutions(solution_size); + // Get the list of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + solutions.data(), + &solution_size)); + // Sort the solutions in ascending order to make the solution vector deterministic across runs + std::sort(solutions.begin(), solutions.end()); + + std::vector>>>> ret; + for (size_t i = 0; i < solutions.size(); ++i) { + auto callable = std::make_unique>(solutions[i]); + ret.emplace_back(std::make_pair(c10::str("Gemm_Rocblas_", solutions[i]), std::move(callable))); + } + return ret; +} + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/README.md b/aten/src/ATen/cuda/tunable/README.md new file mode 100644 index 0000000000000..364e6975c6c64 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/README.md @@ -0,0 +1,88 @@ +# TunableOp + +This directory implements a TunableOp interface. + +Some operations, such as GEMMs, could be implemented using more than one library or more than one technique. For +example, a GEMM could be implemented for CUDA or ROCm using either the blas or blasLt libraries. Further, ROCm's +rocblas and hipblaslt libraries allow the user to query for all possible algorithms and then choose one. How does one +know which implementation is the fastest and should be chosen? That's what TunableOp provides. + +The behavior of TunableOp is currently easily manipulated through environment variables, though you could use the C++ +interface of at::cuda::tunable::getTuningContext(). A Python interface to the TuningContext does not yet exist. + +Currently only a TunableGemm for ROCm is implemented. Any call to at::cuda::blas::gemm() can optionally use the +TunableGemm. Calling gemm() for a given set of input arguments (transa, transb, m, n, k) will attempt to use the +fastest available implementation. + +## Environment Variables + +#### PYTORCH_TUNABLEOP_ENABLED +Default is 0. Set to 1 to enable. +This is the big on/off switch for all TunableOp implementations. + +#### PYTORCH_TUNABLEOP_TUNING +Default is 1. Set to 0 to disable. 
+When enabled, if a tuned entry isn't found, the tuning step runs and the entry is recorded.
+
+#### PYTORCH_TUNABLEOP_VERBOSE
+Default is 0. Set to 1 to enable.
+This will produce a lot of diagnostic messages but may be useful to see if TunableOp is being used at all.
+Otherwise, TunableOp is completely silent unless there is a warning or error during its use.
+
+#### PYTORCH_TUNABLEOP_FILENAME
+Default is 'tunableop_results.csv'. If you provide a filename, the TuningContext will attempt to read it the first time
+the context is used. If tuning is enabled and new tunings are discovered, it will also write out to this same filename
+with all tunings, both the ones it read in at startup as well as the new ones found at runtime. This can be used, for
+example, to build up a tunings file across many workloads by reusing the same file. Unsetting this variable is not
+recommended but can be done, in which case the tuning results will not be saved.
+
+#### PYTORCH_TUNABLEOP_NUMERICAL_CHECK
+Default is 1. Set to 0 to disable. Compare the results of each possible solution against the default solution and reject
+those with low accuracy.
+
+#### PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED
+Default is 1. Set to 0 to prevent hipBLASLt from being considered during tuning.
+
+### Tuning Iterations
+By default, each possible solution for a given operator will be run for either 100 iterations or as many iterations as
+can be run within 30ms, whichever is smaller. Its average execution time will be calculated. The fastest solution is
+chosen. In addition, a set of warm-up iterations can optionally be run prior to the timed iterations. The following
+environment variables can be used to set either the maximum number of iterations to attempt or the maximum amount of
+time allowed in milliseconds, or both, in which case the smaller of the two values is used.
+
+#### PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS
+Default is 30.
+
+#### PYTORCH_TUNABLEOP_MAX_TUNING_ITERATIONS
+Default is 100.
+
+#### PYTORCH_TUNABLEOP_MAX_WARMUP_DURATION_MS
+Default is 0, meaning it is not used.
+
+#### PYTORCH_TUNABLEOP_MAX_WARMUP_ITERATIONS
+Default is 1.
+
+## File Output
+
+Assuming you specified a filename, you'll end up with a CSV file with contents like so:
+
+```
+Validator,PT_VERSION,2.2.0
+Validator,ROCM_VERSION,6.0.0.0-12969-1544e39
+Validator,HIPBLASLT_VERSION,0.6.0-a9c5cc7
+Validator,ROCBLAS_VERSION,4.0.0-72e57364-dirty
+GemmTunableOp_float_NT,nt_25088_4096_64,1219,1.262
+GemmTunableOp_float_NT,nt_4096_4096_64,1216,0.033
+```
+
+Note the "Validator" lines. If you change a library version, ROCm version, or PyTorch version, TunableOp will detect
+this and will not load the tunings because they are likely affected by other software changes.
+
+The remaining lines are the tuned solutions for each TunableOp encountered during your execution. Each line consists of
+4 comma-separated fields: operator name, operator parameters, solution name, and average execution time. The execution
+time is an optional field. The CSV file can be edited, but with caution. For example, the solution name (field 3) can be
+changed to "Default" and it will fall back to the original PyTorch untuned implementation. Or, in the case of ROCm's
+hipBLAS or hipBLASLt libraries, if you know the specific solution index you can override the solution that TunableOp
+selected by replacing the value. The operator name and parameters (fields 1 and 2) are internally named and should not
+be modified.
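+
+Putting the environment variables and the file output together, the following is a minimal usage sketch. It assumes a
+ROCm build of PyTorch with TunableOp available; the matrix shapes and the output filename are illustrative only. The
+description of the CSV fields continues after the sketch.
+
+```python
+import os
+
+# Set these before the first tunable GEMM is dispatched; they are read once.
+os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"             # master on/off switch
+os.environ["PYTORCH_TUNABLEOP_TUNING"] = "1"              # tune entries that are not yet recorded
+os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "my_tunings.csv"  # illustrative filename
+
+import torch
+
+a = torch.randn(1024, 512, device="cuda", dtype=torch.float16)
+b = torch.randn(512, 256, device="cuda", dtype=torch.float16)
+c = a @ b  # routes through at::cuda::blas::gemm(), which may use the tunable path
+torch.cuda.synchronize()
+# my_tunings.csv is written when the TuningContext is destroyed at process exit.
+```
+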
In the case of GemmTunableOp, field 1 indicates the datatype and whether the inputs are transposed (T) or +not (N) and field 2 indicates the M, N, K input shapes. diff --git a/aten/src/ATen/cuda/tunable/StreamTimer.cpp b/aten/src/ATen/cuda/tunable/StreamTimer.cpp new file mode 100644 index 0000000000000..1407c32dbb352 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/StreamTimer.cpp @@ -0,0 +1,43 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#include + +#include +#include +#include + +namespace at::cuda::tunable { + +StreamTimer::StreamTimer() { + AT_CUDA_CHECK(cudaEventCreate(&start_)); + AT_CUDA_CHECK(cudaEventCreate(&end_)); +} + +StreamTimer::~StreamTimer() { +} + +void StreamTimer::Start() { + AT_CUDA_CHECK(cudaDeviceSynchronize()); + AT_CUDA_CHECK(cudaEventRecord(start_, at::cuda::getCurrentCUDAStream())); +} + +void StreamTimer::End() { + AT_CUDA_CHECK(cudaEventRecord(end_, at::cuda::getCurrentCUDAStream())); + AT_CUDA_CHECK(cudaEventSynchronize(end_)); +} + +float StreamTimer::Duration() { + float time; + // time is in ms with a resolution of 1 us + AT_CUDA_CHECK(cudaEventElapsedTime(&time, start_, end_)); + return time; +} + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/StreamTimer.h b/aten/src/ATen/cuda/tunable/StreamTimer.h new file mode 100644 index 0000000000000..69889cbbcbfc6 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/StreamTimer.h @@ -0,0 +1,34 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include + +#include + +namespace at::cuda::tunable { + +class StreamTimer : public ITimer { + public: + StreamTimer(); + virtual ~StreamTimer(); + + void Start() override; + + void End() override; + + float Duration() override; + + private: + cudaEvent_t start_; + cudaEvent_t end_; +}; + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp new file mode 100644 index 0000000000000..22bde7f4c4270 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -0,0 +1,564 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. 
+// +#include + +#include +#include +#include +#include +#include + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::cuda::tunable { + +namespace { + +TuningContext tuning_context; + +} // anonymous namespace + +TuningContext* getTuningContext() { + return &tuning_context; +} + +std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry) { + return stream << entry.key_ << "," << entry.time_; +} + +// TuningResultsManager + +KernelMap TuningResultsManager::Lookup(const std::string& op_signature) { + std::scoped_lock l{lock_}; + auto it = results_.find(op_signature); + if (it == results_.cend()) { + return {}; + } + return it->second; // copied +} + +ResultEntry TuningResultsManager::Lookup(const std::string& op_signature, const std::string& params_signature) { + std::scoped_lock l{lock_}; + auto kernel_map_it = results_.find(op_signature); + if (kernel_map_it == results_.cend()) { + TUNABLE_LOG("missing op_signature, returning null ResultEntry"); + return ResultEntry::Null(); + } + + const auto& km = kernel_map_it->second; + auto it = km.find(params_signature); + if (it == km.cend()) { + TUNABLE_LOG("missing params_signature, returning null ResultEntry"); + return ResultEntry::Null(); + } + return it->second; +} + +inline void TuningResultsManager::AddImpl(const std::string& op_signature, + const std::string& params_signature, + ResultEntry best, + KernelMap& kernel_map) { + auto it = kernel_map.find(params_signature); + if (it != kernel_map.end()) { + if (it->second != best) { + TUNABLE_LOG(op_signature, "(", params_signature, ") already has a best kernel ", + "id=", it->second, " selected, want to add a different best kernel ", best, + ", the new kernel id will be ignored."); + } + return; + } + + TUNABLE_LOG(op_signature, "(", params_signature, ") -> ", best); + kernel_map.emplace(params_signature, best); +} + +void TuningResultsManager::Add(const std::string& op_signature, const std::string& params_signature, ResultEntry best) { + std::scoped_lock l{lock_}; + + auto it = results_.find(op_signature); + if (it == results_.end()) { + it = results_.insert({op_signature, {}}).first; + } + + AddImpl(op_signature, params_signature, best, it->second); +} + +void TuningResultsManager::Delete(const std::string& op_signature, const std::string& params_signature) { + std::scoped_lock l{lock_}; + + auto it = results_.find(op_signature); + if (it == results_.end()) { + return; + } + + auto it2 = it->second.find(params_signature); + if (it2 == it->second.end()) { + return; + } + + TUNABLE_LOG(op_signature, "(", params_signature, ")"); + it->second.erase(it2); +} + +inline void TuningResultsManager::DisjointMergeImpl( + const std::string& op_signature, + const KernelMap& kernel_map, + /*out*/ std::unordered_map& results) { + auto it = results.find(op_signature); + if (it == results.end()) { + for (const auto& [param_sig, kernel_id] : kernel_map) { + TUNABLE_LOG(op_signature, "(", param_sig, ") -> ", kernel_id); + } + results[op_signature] = kernel_map; + return; + } + + for (const auto& [params_signature, best] : kernel_map) { + AddImpl(op_signature, params_signature, best, it->second); + } +} + +void TuningResultsManager::Load(const std::unordered_map& results_to_load) { + TUNABLE_LOG("Loading results"); + std::scoped_lock l{lock_}; + for (const auto& [op_signature, kernel_map] : results_to_load) { + DisjointMergeImpl(op_signature, kernel_map, 
results_); + } +} + +ResultsMap TuningResultsManager::Dump() { + std::scoped_lock l{lock_}; + return results_; +} + +void TuningResultsManager::DisjointMerge(const std::string& op_signature, const KernelMap& kernel_map) { + std::scoped_lock l{lock_}; + DisjointMergeImpl(op_signature, kernel_map, results_); +} + +size_t TuningResultsManager::GetSize() { + size_t size = 0; + std::scoped_lock l{lock_}; + for (const auto& [op_signature, kernel_map] : results_) { + size += kernel_map.size(); + } + return size; +} + +// TuningResultsValidator + +TuningResultsValidator::TuningResultsValidator() { + RegisterValidator( + "PT_VERSION", + [this]() { return GetPyTorchVersion(); }, + [this](auto&& k) { return ValidatePyTorchVersion(std::forward(k)); }); +} + +std::unordered_map TuningResultsValidator::GetAllValidators() const { + std::unordered_map ret; + for (const auto& [key, get_validate_func_pair] : validators_) { + const GetFunc& getter = get_validate_func_pair.first; + ret[key] = getter(); + } + return ret; +} + +static bool CheckMandatoryKeys( + const TuningResultsValidator::GetValidateFuncs& gv_funcs, + const std::unordered_map& to_check) { + bool passed = true; + for (const auto& k : TuningResultsValidator::mandatory_keys) { + if (gv_funcs.find(k) == gv_funcs.end()) { + passed = false; + TUNABLE_LOG("key=\"", k, "\" is not registered for Get and Validate. "); + } + + if (to_check.find(k) == to_check.end()) { + passed = false; + TUNABLE_LOG("key=\"", k, "\" is not provided for validation. "); + } + } + return passed; +} + +static bool CheckKeysMatching( + const TuningResultsValidator::GetValidateFuncs& gv_funcs, + const std::unordered_map& to_check) { + auto get_keys = [](const auto& it) -> std::string { return it.first; }; + std::vector required_keys; + std::vector provided_keys; + std::transform(gv_funcs.cbegin(), gv_funcs.cend(), std::back_inserter(required_keys), get_keys); + std::transform(to_check.cbegin(), to_check.cend(), std::back_inserter(provided_keys), get_keys); + std::sort(required_keys.begin(), required_keys.end()); + std::sort(provided_keys.begin(), provided_keys.end()); + + std::unordered_set intersection; + std::set_intersection(required_keys.cbegin(), required_keys.cend(), + provided_keys.cbegin(), provided_keys.cend(), + std::inserter(intersection, intersection.end())); + bool matched = true; + if (intersection.size() != required_keys.size()) { + matched = false; + for (const auto& k : required_keys) { + if (intersection.find(k) == intersection.end()) { + TORCH_WARN("Unmatched validator: \"", k, "\" is required, but the tuning results does not provide it. "); + } + } + } + if (intersection.size() != provided_keys.size()) { + matched = false; + for (const auto& k : provided_keys) { + if (intersection.find(k) == intersection.end()) { + TORCH_WARN("Unmatched validator: \"", k, "\" is provided, but pytorch is unable to consume it. 
"); + } + } + } + return matched; +} + +TuningStatus TuningResultsValidator::ValidateAll( + const std::unordered_map& to_validate) const { + if (!CheckMandatoryKeys(validators_, to_validate)) { + return FAIL; + } + if (!CheckKeysMatching(validators_, to_validate)) { + return FAIL; + } + + for (const auto& [key, value] : to_validate) { + const auto& it = validators_.find(key); + if (it == validators_.cend()) { + TORCH_WARN("Failed to lookup validator using key ", key); + for (const auto& [key2, val2] : validators_) { + TORCH_WARN("available key ", key2); + } + return FAIL; + } + const ValidateFunc& validator = it->second.second; + if (validator(value) != OK) { + TORCH_WARN("Failed validator: ", key); + return FAIL; + } + } + + return OK; +} + +void TuningResultsValidator::RegisterValidator(const std::string& key, const GetFunc& gf, const ValidateFunc& vf) { + if (validators_.find(key) != validators_.end()) { + TORCH_WARN("Attempting to re-register validator with key ", key); + } + else { + validators_[key] = std::make_pair(gf, vf); + } +} + +std::string TuningResultsValidator::GetPyTorchVersion() const { + return TORCH_VERSION; +} + +TuningStatus TuningResultsValidator::ValidatePyTorchVersion(const std::string& value) const { + if (value == GetPyTorchVersion()) { + return OK; + } + return FAIL; +} + +// TuningContext + +TuningContext::TuningContext() : + enable_{false}, + tuning_enable_{true}, + manager_initialized_{false}, + max_tuning_duration_ms_{30}, + max_tuning_iterations_{100}, + max_warmup_duration_ms_{0}, + max_warmup_iterations_{0}, + filename_{}, + results_count_from_input_file_{0} +{ +} + +TuningContext::~TuningContext() { + if (!manager_initialized_) { + // TuningResultsManager was never initialized, no tuning requested or performed. + // This can happen in a DDP job where a python process spawns other workers + // but doesn't do any computation itself. 
+ return; + } + auto filename = GetFilename(); + if (IsTunableOpEnabled() && IsTuningEnabled() && !filename.empty()) { + if (results_count_from_input_file_ < GetTuningResultsManager().GetSize()) { + if (results_count_from_input_file_ > 0) { + TUNABLE_LOG("additional tuning results available, rewriting file ", filename); + } + else { + TUNABLE_LOG("writing file ", filename); + } + if (!WriteFile(filename)) { + TUNABLE_LOG("failed to write file ", filename); + } + } + } +} + +void TuningContext::EnableTunableOp() { + TUNABLE_LOG("Enable TunableOp"); + enable_ = true; +} + +void TuningContext::DisableTunableOp() { + TUNABLE_LOG("Disable TunableOp"); + enable_ = false; +} + +bool TuningContext::IsTunableOpEnabled() const { + static const char *env = std::getenv("PYTORCH_TUNABLEOP_ENABLED"); + if (env != nullptr && strcmp(env, "1") == 0) { + //TUNABLE_LOG("PYTORCH_TUNABLEOP_ENABLED=1"); + return true; + } + return enable_; +} + +void TuningContext::EnableTuning() { + TUNABLE_LOG("Enable Tuning for TunableOp"); + tuning_enable_ = true; +} + +void TuningContext::DisableTuning() { + TUNABLE_LOG("Disable Tuning for TunableOp"); + tuning_enable_ = false; +} + +bool TuningContext::IsTuningEnabled() const { + static const char *env = std::getenv("PYTORCH_TUNABLEOP_TUNING"); + if (env != nullptr && strcmp(env, "0") == 0) { + //TUNABLE_LOG("PYTORCH_TUNABLEOP_TUNING=1"); + return false; + } + return tuning_enable_; +} + +void TuningContext::SetMaxTuningDurationMs(int max_duration_ms) { + max_tuning_duration_ms_ = max_duration_ms; +} + +int TuningContext::GetMaxTuningDurationMs() const { + static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS"); + if (env != nullptr) { + return atoi(env); + } + return max_tuning_duration_ms_; +} + +void TuningContext::SetMaxTuningIterations(int max_iter) { + max_tuning_iterations_ = max_iter; +} + +int TuningContext::GetMaxTuningIterations() const { + static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_TUNING_ITERATIONS"); + if (env != nullptr) { + return atoi(env); + } + return max_tuning_iterations_; +} + +void TuningContext::SetMaxWarmupDurationMs(int max_duration_ms) { + max_warmup_duration_ms_ = max_duration_ms; +} + +int TuningContext::GetMaxWarmupDurationMs() const { + static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_WARMUP_DURATION_MS"); + if (env != nullptr) { + return atoi(env); + } + return max_warmup_duration_ms_; +} + +void TuningContext::SetMaxWarmupIterations(int max_iter) { + max_warmup_iterations_ = max_iter; +} + +int TuningContext::GetMaxWarmupIterations() const { + static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_WARMUP_ITERATIONS"); + if (env != nullptr) { + return atoi(env); + } + return max_warmup_iterations_; +} + +void TuningContext::EnableTunableOpAndTuning() { + EnableTunableOp(); + EnableTuning(); +} + +void TuningContext::DisableTunableOpAndTuning() { + DisableTunableOp(); + DisableTuning(); +} + +TuningResultsManager& TuningContext::GetTuningResultsManager() { + c10::call_once(manager_init_once_, [this]() { + manager_initialized_ = true; + if (GetFilename().empty()) { + // if SetFilename() was not already called, call it now with the default or env var + const char *env = std::getenv("PYTORCH_TUNABLEOP_FILENAME"); + std::string filename = (env == nullptr) ? 
"tunableop_results.csv" : env; + SetFilename(filename); + } + auto filename = GetFilename(); + if (!filename.empty()) { + ReadFile(filename); + // attempt immediately to open file for writing to catch errors early + std::ofstream file(filename, std::ios::out | std::ios::app); + if (!file.good()) { + TORCH_WARN("failed to open file '", filename, "' for writing; your tuning results will not be saved"); + } + } + }); + return manager_; +} + +TuningResultsValidator& TuningContext::GetTuningResultsValidator() { + return validator_; +} + +TuningResults TuningContext::GetTuningResults() { + TuningResults tr; + tr.validators = GetTuningResultsValidator().GetAllValidators(); + tr.results = GetTuningResultsManager().Dump(); + return tr; +} + +TuningStatus TuningContext::LoadTuningResults(const TuningResults& tr) { + TORCH_CHECK(GetTuningResultsValidator().ValidateAll(tr.validators)); + GetTuningResultsManager().Load(tr.results); + return OK; +} + +void TuningContext::SetFilename(const std::string& filename) { + filename_ = filename; + + if (filename_.empty()) { + return; + } + + // differentiate filename based on device ordinal to avoid + // use case of one process per device writing to same file + std::string device = c10::str(int(c10::cuda::current_device())); + + // does filename contain %d to insert device ordinal in specific location? + const std::string TOKEN("%d"); + std::size_t found = filename_.find(TOKEN); + if (found != std::string::npos) { + filename_.replace(found, TOKEN.length(), device); + } + else { + // no %d present, so append device ordinal before final '.' + found = filename_.rfind("."); + if (found != std::string::npos) { + filename_.insert(found, device); + } + else { + // all else fails, just append + filename_.append(device); + } + } +} + +std::string TuningContext::GetFilename() const { + return filename_; +} + +bool TuningContext::ReadFile(const std::string& filename) { + TUNABLE_LOG("reading tuning results from ", filename); + ResultsMap results; + std::unordered_map validators; + std::string line; + std::ifstream file(filename); + if (!file) { + TUNABLE_LOG("could not open ", filename, " for reading tuning results"); + return false; + } + while (std::getline(file, line)) { + if (line.empty()) { + continue; + } + std::string part; + std::vector parts; + std::stringstream line_as_stream(line); + while (std::getline(line_as_stream, part, ',')) { + parts.push_back(part); + } + if (parts[0] == "Validator" && parts.size() >= 3) { + validators[parts[1]] = parts[2]; + TUNABLE_LOG("Validator ", parts[1], "=", parts[2]); + } + else if (parts.size() >= 4) { + results[parts[0]].emplace(parts[1], ResultEntry(parts[2], atof(parts[3].c_str()))); + } + else if (parts.size() >= 3) { + // the timestamp from the file is optional + results[parts[0]].emplace(parts[1], ResultEntry(parts[2], 0)); + } + else { + TUNABLE_LOG("could not parse line: ", line); + } + } + if (GetTuningResultsValidator().ValidateAll(validators) != FAIL) { + manager_.Load(results); + results_count_from_input_file_ = manager_.GetSize(); + } + else { + TUNABLE_LOG("results validator check failed"); + return false; + } + return true; +} + +bool TuningContext::WriteFile(const std::string& filename) { + std::ofstream file(filename, std::ios::out | std::ios::trunc); + if (!file.good()) { + TUNABLE_LOG("error opening tuning results file for writing ", filename); + return false; + } + auto validators = GetTuningResultsValidator().GetAllValidators(); + for (const auto& [key, val] : validators) { + file << "Validator," << key << 
"," << val << std::endl; + } + auto results = GetTuningResultsManager().Dump(); + for (const auto& [op_sig, kernelmap] : results) { + for (const auto& [param_sig, result] : kernelmap) { + file << op_sig << "," << param_sig << "," << result << std::endl; + } + } + file.close(); + return true; +} + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/Tunable.h b/aten/src/ATen/cuda/tunable/Tunable.h new file mode 100644 index 0000000000000..eb849a213fe5a --- /dev/null +++ b/aten/src/ATen/cuda/tunable/Tunable.h @@ -0,0 +1,205 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::cuda::tunable { + +static void TunableLog(const std::string& msg) { + static const char *env = getenv("PYTORCH_TUNABLEOP_VERBOSE"); + if (env != nullptr && strcmp(env, "1") == 0) { + std::cerr << msg << std::endl; + } +} +#define TUNABLE_LOG(...) TunableLog(c10::str(__VA_ARGS__)) + +enum TuningStatus { + OK = 0, + FAIL = 1, + UNSUPPORTED = 2, +}; + +// Mapping from params signature to kernel id +class ResultEntry { + public: + explicit ResultEntry(const std::string& key, double time) : key_(key), time_(time) {} + bool operator==(const ResultEntry& other) { return key_ == other.key_; } + bool operator!=(const ResultEntry& other) { return key_ != other.key_; } + operator std::string () { return key_; } + friend std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry); + static ResultEntry Null() { return ResultEntry("Null", 0.0); } + static ResultEntry Default() { return ResultEntry("Default", 0.0); } + + private: + std::string key_; + double time_; +}; + +typedef std::unordered_map KernelMap; +typedef std::unordered_map ResultsMap; + +struct TuningResults { + // Validates if these results are compatible with the libraries + std::unordered_map validators; + + // Mapping from Callable signature to Callable's tuning result + ResultsMap results; +}; + +class TuningResultsManager { + public: + TuningResultsManager() = default; + ~TuningResultsManager() = default; + + KernelMap Lookup(const std::string& op_signature); + + ResultEntry Lookup(const std::string& op_signature, const std::string& params_signature); + + inline void AddImpl(const std::string& op_signature, + const std::string& params_signature, + ResultEntry best, + KernelMap& kernel_map); + + void Add(const std::string& op_signature, + const std::string& params_signature, + ResultEntry best); + + void Delete(const std::string& op_signature, const std::string& params_signature); + + inline void DisjointMergeImpl( + const std::string& op_signature, + const KernelMap& kernel_map, + /*out*/ ResultsMap& results); + + void Load(const ResultsMap& results_to_load); + + ResultsMap Dump(); + + void DisjointMerge(const std::string& op_signature, const KernelMap& kernel_map); + + size_t GetSize(); + + private: + std::mutex lock_; + ResultsMap results_; +}; + +class TuningResultsValidator { + public: + using GetFunc = std::function; + using ValidateFunc = std::function; + using GetValidateFuncs = std::unordered_map>; + + TuningResultsValidator(); + 
~TuningResultsValidator() = default; + + std::unordered_map GetAllValidators() const; + TuningStatus ValidateAll(const std::unordered_map& to_validate) const; + void RegisterValidator(const std::string& key, const GetFunc& gf, const ValidateFunc& vf); + + protected: + std::string GetPyTorchVersion() const; + TuningStatus ValidatePyTorchVersion(const std::string& value) const; + + public: + static constexpr const std::array mandatory_keys{"PT_VERSION"}; + + private: + GetValidateFuncs validators_; +}; + +class TuningContext { + public: + TuningContext(); + ~TuningContext(); + TuningContext(TuningContext &) = delete; + TuningContext(TuningContext &&) = delete; + TuningContext &operator=(TuningContext &) = delete; + TuningContext &operator=(TuningContext &&) = delete; + + void EnableTunableOp(); + void DisableTunableOp(); + bool IsTunableOpEnabled() const; + + void EnableTuning(); + void DisableTuning(); + bool IsTuningEnabled() const; + + void SetMaxTuningDurationMs(int max_duration_ms); + int GetMaxTuningDurationMs() const; + + void SetMaxTuningIterations(int max_iter); + int GetMaxTuningIterations() const; + + void SetMaxWarmupDurationMs(int max_duration_ms); + int GetMaxWarmupDurationMs() const; + + void SetMaxWarmupIterations(int max_iter); + int GetMaxWarmupIterations() const; + + void EnableTunableOpAndTuning(); + void DisableTunableOpAndTuning(); + + TuningResultsManager& GetTuningResultsManager(); + + TuningResultsValidator& GetTuningResultsValidator(); + + TuningResults GetTuningResults(); + + TuningStatus LoadTuningResults(const TuningResults& tr); + + void SetFilename(const std::string& filename); + std::string GetFilename() const; + + protected: + bool ReadFile(const std::string& filename); + bool WriteFile(const std::string& filename); + + private: + bool enable_; + bool tuning_enable_; + bool manager_initialized_; + int max_tuning_duration_ms_; + int max_tuning_iterations_; + int max_warmup_duration_ms_; + int max_warmup_iterations_; + mutable TuningResultsManager manager_; + mutable c10::once_flag manager_init_once_; + TuningResultsValidator validator_; + std::string filename_; + size_t results_count_from_input_file_; +}; + +TuningContext* getTuningContext(); + +class ITimer { + public: + ITimer() = default; + virtual ~ITimer() = default; + + virtual void Start() = 0; + virtual void End() = 0; + + /// Computes the elapsed time in milliseconds between Start() and End() + virtual float Duration() = 0; +}; + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h new file mode 100644 index 0000000000000..3b5e7e0903c89 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/TunableGemm.h @@ -0,0 +1,368 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. 
+// +#pragma once + +#include +#ifdef USE_ROCM +#if ROCM_VERSION >= 50700 +#include +#endif +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef USE_ROCM +#include +#endif + +#define STRINGIFY(s) #s +#define XSTRINGIFY(s) STRINGIFY(s) + +namespace at::cuda::tunable { + +template +class DefaultGemmOp : public Callable> { + public: + TuningStatus Call(const GemmParams* params) override { + at::cuda::blas::gemm_internal( + params->transa, params->transb, + params->m, params->n, params->k, + params->alpha, + params->a, params->lda, + params->b, params->ldb, + params->beta, + params->c, params->ldc); + return OK; + } +}; + +template +class DefaultGemmStridedBatchedOp : public Callable> { + public: + TuningStatus Call(const GemmStridedBatchedParams* params) override { + at::cuda::blas::bgemm_internal( + params->transa, params->transb, + params->m, params->n, params->k, + params->alpha, + params->a, params->lda, params->stride_a, + params->b, params->ldb, params->stride_b, + params->beta, + params->c, params->ldc, params->stride_c, + params->batch); + return OK; + } +}; + +template +class DefaultScaledGemmOp : public Callable> { + public: + TuningStatus Call(const ScaledGemmParams* params) override { + at::cuda::blas::scaled_gemm( + params->transa, + params->transb, + params->m, + params->n, + params->k, + params->a, + params->a_scale_ptr, + params->lda, + params->a_dtype, + params->b, + params->b_scale_ptr, + params->ldb, + params->b_dtype, + params->bias_ptr, + params->bias_dtype, + params->c, + params->c_scale_ptr, + params->ldc, + params->c_dtype, + params->amax_ptr, + params->use_fast_accum); + return OK; + } +}; + +template +inline bool IsZero(T v) { + return v == 0.0f; +} + +template <> +inline bool IsZero(BFloat16 v) { + return v.x == 0; +} + +template <> +inline bool IsZero(Half v) { + return float(v) == 0.0f; +} + +template <> +inline bool IsZero(c10::complex v) { + return v == 0.0; +} + +template <> +inline bool IsZero(c10::complex v) { + return v == 0.0f; +} + +template +inline std::string TypeName(T v) { + return "unknown"; +} + +template <> +inline std::string TypeName(float v) { + return "float"; +} + +template <> +inline std::string TypeName(double v) { + return "double"; +} + +template <> +inline std::string TypeName(BFloat16 v) { + return "BFloat16"; +} + +template <> +inline std::string TypeName(Half v) { + return "Half"; +} + +template <> +inline std::string TypeName(Float8_e4m3fn v) { + return "Float8_e4m3fn"; +} + +template <> +inline std::string TypeName(Float8_e5m2 v) { + return "Float8_e5m2"; +} + +template <> +inline std::string TypeName(Float8_e4m3fnuz v) { + return "Float8_e4m3fnuz"; +} + +template <> +inline std::string TypeName(Float8_e5m2fnuz v) { + return "Float8_e5m2fnuz"; +} + +template <> +inline std::string TypeName(c10::complex v) { + return "c10::complex"; +} + +template <> +inline std::string TypeName(c10::complex v) { + return "c10::complex"; +} + + +template +class GemmTunableOp : public TunableOp, StreamTimer> { + public: + GemmTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + +#ifdef USE_ROCM + for (auto&& [name, op] : GetRocBlasGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + + if (validators.find("ROCM_VERSION") == validators.end()) { + std::string rocm_version = ROCM_BUILD_INFO; + 
getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "ROCM_VERSION", + [rocm_version]() { return rocm_version; }, + [rocm_version](auto&& k) { return rocm_version == k ? OK : FAIL; }); + } + + if (validators.find("GCN_ARCH_NAME") == validators.end()) { + std::string gcn_arch_name = at::cuda::getCurrentDeviceProperties()->gcnArchName; + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "GCN_ARCH_NAME", + [gcn_arch_name]() { return gcn_arch_name; }, + [gcn_arch_name](auto&& k) { return gcn_arch_name == k ? OK : FAIL; }); + } + + if (validators.find("ROCBLAS_VERSION") == validators.end()) { + std::string rocblas_version = c10::str( + XSTRINGIFY(ROCBLAS_VERSION_MAJOR), ".", + XSTRINGIFY(ROCBLAS_VERSION_MINOR), ".", + XSTRINGIFY(ROCBLAS_VERSION_PATCH), "-", + XSTRINGIFY(ROCBLAS_VERSION_TWEAK)); + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "ROCBLAS_VERSION", + [rocblas_version]() { return rocblas_version; }, + [rocblas_version](auto&& k) { return rocblas_version == k ? OK : FAIL; }); + } +#endif + +#if defined(USE_ROCM) && ROCM_VERSION >= 50700 + static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env == nullptr || strcmp(env, "1") == 0) { + // disallow tuning of hipblaslt with c10::complex + if constexpr ( + !std::is_same_v> && + !std::is_same_v>) { + for (auto&& [name, op] : GetHipBlasLtGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + + if (validators.find("HIPBLASLT_VERSION") == validators.end()) { + std::string hipblaslt_version = c10::str( + XSTRINGIFY(HIPBLASLT_VERSION_MAJOR), ".", + XSTRINGIFY(HIPBLASLT_VERSION_MINOR), ".", + XSTRINGIFY(HIPBLASLT_VERSION_PATCH), "-", + XSTRINGIFY(HIPBLASLT_VERSION_TWEAK)); + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "HIPBLASLT_VERSION", + [hipblaslt_version]() { return hipblaslt_version; }, + [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); + } + } +#endif + } + + std::string Signature() override { + return c10::str("GemmTunableOp_", TypeName(T{}), "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +template +class GemmStridedBatchedTunableOp : public TunableOp, StreamTimer> { + public: + GemmStridedBatchedTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + +#ifdef USE_ROCM + for (auto&& [name, op] : GetRocBlasGemmStridedBatchedTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + + if (validators.find("ROCM_VERSION") == validators.end()) { + std::string rocm_version = ROCM_BUILD_INFO; + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "ROCM_VERSION", + [rocm_version]() { return rocm_version; }, + [rocm_version](auto&& k) { return rocm_version == k ? OK : FAIL; }); + } + + if (validators.find("GCN_ARCH_NAME") == validators.end()) { + std::string gcn_arch_name = at::cuda::getCurrentDeviceProperties()->gcnArchName; + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "GCN_ARCH_NAME", + [gcn_arch_name]() { return gcn_arch_name; }, + [gcn_arch_name](auto&& k) { return gcn_arch_name == k ? 
OK : FAIL; }); + } + + if (validators.find("ROCBLAS_VERSION") == validators.end()) { + std::string rocblas_version = c10::str( + XSTRINGIFY(ROCBLAS_VERSION_MAJOR), ".", + XSTRINGIFY(ROCBLAS_VERSION_MINOR), ".", + XSTRINGIFY(ROCBLAS_VERSION_PATCH), "-", + XSTRINGIFY(ROCBLAS_VERSION_TWEAK)); + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "ROCBLAS_VERSION", + [rocblas_version]() { return rocblas_version; }, + [rocblas_version](auto&& k) { return rocblas_version == k ? OK : FAIL; }); + } +#endif + +#if defined(USE_ROCM) && ROCM_VERSION >= 50700 + static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env == nullptr || strcmp(env, "1") == 0) { + // disallow tuning of hipblaslt with c10::complex + if constexpr ( + !std::is_same_v> && + !std::is_same_v>) { + for (auto&& [name, op] : GetHipBlasLtGemmStridedBatchedTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + + if (validators.find("HIPBLASLT_VERSION") == validators.end()) { + std::string hipblaslt_version = c10::str( + XSTRINGIFY(HIPBLASLT_VERSION_MAJOR), ".", + XSTRINGIFY(HIPBLASLT_VERSION_MINOR), ".", + XSTRINGIFY(HIPBLASLT_VERSION_PATCH), "-", + XSTRINGIFY(HIPBLASLT_VERSION_TWEAK)); + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "HIPBLASLT_VERSION", + [hipblaslt_version]() { return hipblaslt_version; }, + [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); + } + } +#endif + } + + std::string Signature() override { + return c10::str("GemmStridedBatchedTunableOp_", TypeName(T{}), "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +template +class ScaledGemmTunableOp : public TunableOp, StreamTimer> { + public: + ScaledGemmTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + +#if defined(USE_ROCM) && ROCM_VERSION >= 50700 + for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + + if (validators.find("HIPBLASLT_VERSION") == validators.end()) { + std::string hipblaslt_version = c10::str( + XSTRINGIFY(HIPBLASLT_VERSION_MAJOR), ".", + XSTRINGIFY(HIPBLASLT_VERSION_MINOR), ".", + XSTRINGIFY(HIPBLASLT_VERSION_PATCH), "-", + XSTRINGIFY(HIPBLASLT_VERSION_TWEAK)); + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "HIPBLASLT_VERSION", + [hipblaslt_version]() { return hipblaslt_version; }, + [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); + } +#endif + } + + std::string Signature() override { + return c10::str("ScaledGemmTunableOp", + "_", TypeName(AT{}), + "_", TypeName(BT{}), + "_", TypeName(CT{}), + "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +#undef XSTRINGIFY +#undef STRINGIFY + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/TunableOp.h b/aten/src/ATen/cuda/tunable/TunableOp.h new file mode 100644 index 0000000000000..65257974ab0cd --- /dev/null +++ b/aten/src/ATen/cuda/tunable/TunableOp.h @@ -0,0 +1,242 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. 
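The TunableGemm.h hunks above register build-environment validators (ROCM_VERSION, GCN_ARCH_NAME, ROCBLAS_VERSION, HIPBLASLT_VERSION) so that previously saved tuning results are only reused when they were produced under a matching toolchain. The sketch below is not the PyTorch TuningResultsValidator; it is a minimal, self-contained illustration with hypothetical names (MiniValidator, Register, Snapshot, Accept) of the two-lambda pattern used above: one callback reports the current value, the other checks a stored value against it.

#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <utility>

// Minimal stand-in for the validator-registry pattern (hypothetical names).
class MiniValidator {
 public:
  using GetFn = std::function<std::string()>;
  using CheckFn = std::function<bool(const std::string&)>;

  // Register a key with a "current value" getter and a "stored value" checker.
  void Register(const std::string& key, GetFn get, CheckFn check) {
    validators_[key] = {std::move(get), std::move(check)};
  }

  // Values that would be written next to saved tuning results.
  std::map<std::string, std::string> Snapshot() const {
    std::map<std::string, std::string> out;
    for (const auto& [key, fns] : validators_) {
      out[key] = fns.first();
    }
    return out;
  }

  // Accept previously saved results only if every recorded value still checks out.
  bool Accept(const std::map<std::string, std::string>& saved) const {
    for (const auto& [key, value] : saved) {
      auto it = validators_.find(key);
      if (it == validators_.end() || !it->second.second(value)) {
        return false;
      }
    }
    return true;
  }

 private:
  std::map<std::string, std::pair<GetFn, CheckFn>> validators_;
};

int main() {
  MiniValidator v;
  std::string rocm_version = "6.0";  // would come from ROCM_BUILD_INFO in the real code
  v.Register(
      "ROCM_VERSION",
      [rocm_version]() { return rocm_version; },
      [rocm_version](const std::string& k) { return k == rocm_version; });

  auto saved = v.Snapshot();             // e.g. persisted alongside tuned GEMM results
  std::cout << v.Accept(saved) << "\n";  // 1: same environment, results reusable
  saved["ROCM_VERSION"] = "5.7";
  std::cout << v.Accept(saved) << "\n";  // 0: mismatch, saved results rejected
  return 0;
}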
+// +#pragma once + +#include +#include + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include + +namespace at::cuda::tunable { + +template +class Callable { + public: + Callable() = default; + Callable(Callable&&) = default; + virtual ~Callable() = default; + virtual TuningStatus Call(const ParamsT*) { + return FAIL; + } + virtual TuningStatus IsSupported(const ParamsT* params) { + return Call(params); + } +}; + +template +class TunableOp { + public: + TunableOp() = default; + TunableOp(TunableOp&&) = default; + virtual ~TunableOp() = default; + + TuningStatus operator()(const ParamsT* params) { + ResultEntry result = ResultEntry::Null(); + TuningContext* ctx = getTuningContext(); + if (ctx->IsTunableOpEnabled()) { + auto& mgr = ctx->GetTuningResultsManager(); + auto op_sig = Signature(); + auto params_sig = params->Signature(); + result = mgr.Lookup(op_sig, params_sig); + // If no previous tuning result has been found, we do the tuning iff tuning is enabled + if (result == ResultEntry::Null() && ctx->IsTuningEnabled()) { + result = FindFastest(params); + mgr.Add(op_sig, params_sig, result); + } + } + else { + result = ResultEntry::Default(); + } + if (result == ResultEntry::Null()) { + TUNABLE_LOG("no result, using default"); + result = ResultEntry::Default(); + } + auto iter = ops_.find(result); + TORCH_CHECK(iter != ops_.end()); + return iter->second->Call(params); + } + + virtual std::string Signature() { + // According to C++17 standard https://wg21.link/n4659 section 15.7.4 + // > if the operand of typeid refers to the + // > object under construction or destruction, typeid yields the std::type_info object representing the constructor + // > or destructor's class. + // So delay the op signature generation. + c10::call_once(signature_init_once_, [this]() { signature_ = CreateSignature(); }); + return signature_; + } + + protected: + void RegisterOp(const std::string& name, std::unique_ptr> op) { + this->op_names_.emplace_back(name); + this->ops_.emplace(name, std::move(op)); + } + + private: + static void WarmUp(Callable *op, ParamsT* param, size_t num_iter) { + for (size_t i = 0; i < num_iter; i++) { + TORCH_CHECK(op->Call(param) == OK); + } + } + + static double Profile(Callable *op, ParamsT* param, size_t num_iter) { + TimerT timer{}; + timer.Start(); + for (size_t i = 0; i < num_iter; i++) { + TORCH_CHECK(op->Call(param) == OK); + } + timer.End(); + return timer.Duration() / num_iter; + } + + protected: + bool IsNumericsCheckEnabled() { + static const char *env = getenv("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); + if (env != nullptr && strcmp(env, "0") == 0) { + return false; + } + return true; + } + + virtual ResultEntry FindFastest(const ParamsT* params) { + TuningContext* ctx = getTuningContext(); + auto op_sig = Signature(); + auto params_sig = params->Signature(); + TUNABLE_LOG("finding fastest for ", op_sig, '(', params_sig, ')', " out of ", op_names_.size(), " candidates"); + auto min_duration_ms = std::numeric_limits::infinity(); + std::string id_name = "Default"; + + // calculate a reference answer for numerical check + ParamsT* reference_params = params->DeepCopy(); + TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK); + + // need a copy of params to reuse + ParamsT* reusable_params = params->DeepCopy(); + + for (size_t i = 0; i < op_names_.size(); i++) { + auto* candidate = ops_[op_names_[i]].get(); // borrow pointer + auto status = candidate->Call(reusable_params); + if (status != OK) { + TUNABLE_LOG("├──unsupported
id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + + if (IsNumericsCheckEnabled()) { + ParamsT* numerical_params = params->DeepCopy(); + WarmUp(candidate, numerical_params, 1); + status = reference_params->NumericalCheck(numerical_params); + numerical_params->Delete(); + if (status != OK) { + TUNABLE_LOG("ā”œā”€ā”€numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + } + + // collect a small profile + constexpr const int approx_num_iter = 3; + auto approx_duration = Profile(candidate, reusable_params, approx_num_iter); + // bail if too slow + if (approx_duration > 2 * min_duration_ms) { + TUNABLE_LOG("ā”œā”€ā”€skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + + // for warmup does user set max duration, max iters, or both? + double max_warmup_duration = ctx->GetMaxWarmupDurationMs(); + int max_warmup_iter = ctx->GetMaxWarmupIterations(); + int warmup_iter = 1; // default + if (max_warmup_duration > 0) { + int duration_iters = max_warmup_duration / approx_duration; + if (max_warmup_iter > 0) { + warmup_iter = std::min(max_warmup_iter, duration_iters); + } + else { + warmup_iter = duration_iters; + } + } + else if (max_warmup_iter > 0) { + warmup_iter = max_warmup_iter; + } + + // for tuning does user set max duration, max iters, or both? + double max_tuning_duration = ctx->GetMaxTuningDurationMs(); + int max_tuning_iter = ctx->GetMaxTuningIterations(); + int tuning_iter = 100; // default + if (max_tuning_duration > 0) { + int duration_iters = max_tuning_duration / approx_duration; + if (max_tuning_iter > 0) { + tuning_iter = std::min(max_tuning_iter, duration_iters); + } + else { + tuning_iter = duration_iters; + } + } + else if (max_tuning_iter > 0) { + tuning_iter = max_tuning_iter; + } + + // do the full warmup followed by tuning + double warmup_ms = warmup_iter * approx_duration; + double tuning_ms = tuning_iter * approx_duration; + TUNABLE_LOG("ā”œā”€ā”€tuning using " + "warmup iters ", warmup_iter, " [", warmup_ms, " ms] " + "and tuning iters ", tuning_iter, " [", tuning_ms, " ms] ", + "instance id=", i, ", ", op_sig, "(", params_sig, ") ", op_names_[i]); + WarmUp(candidate, reusable_params, warmup_iter); + auto duration_ms = Profile(candidate, reusable_params, tuning_iter); + if (duration_ms < min_duration_ms) { + TUNABLE_LOG("ā”œā”€ā”€found better instance id=", i, ". " , duration_ms, "ms. 
", op_names_[i]); + min_duration_ms = duration_ms; + id_name = op_names_[i]; + } + } + + reusable_params->Delete(); + reference_params->Delete(); + + TUNABLE_LOG("ā””ā”€ā”€found fastest for ", op_sig, '(', params_sig, ") ", id_name); + return ResultEntry(id_name, min_duration_ms); + } + + private: + std::string CreateSignature() { +#ifndef _WIN32 + const auto* name = typeid(*this).name(); + char buf[256]; + size_t buf_len = 256; + abi::__cxa_demangle(name, buf, &buf_len, nullptr); + buf[255] = '\0'; + return buf; +#else + return typeid(*this).name(); +#endif + } + + mutable c10::once_flag signature_init_once_; + std::string signature_; + + std::unordered_map>> ops_; + std::vector op_names_; +}; + +struct OpParams { + OpParams() {} + virtual ~OpParams() = default; + virtual std::string Signature() const = 0; +}; + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 694e93216b7a8..79a2fe58ad007 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -210,9 +210,7 @@ struct TORCH_CUDA_CPP_API ConvolutionDescriptor if(dataType == CUDNN_DATA_HALF) { AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH)); } else if (dataType == CUDNN_DATA_FLOAT && !allow_tf32) { -#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8000 AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_FMA_MATH)); -#endif } } }; @@ -304,13 +302,9 @@ struct TORCH_CUDA_CPP_API RNNDescriptor : public Descriptor< if (input_type == CUDNN_DATA_HALF) { cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_TENSOR_OP_MATH); } -#endif -#if !defined(USE_CUDNN_RNN_V8_API) && defined(CUDNN_VERSION) && CUDNN_VERSION >= 8000 else if (input_type == CUDNN_DATA_FLOAT && !allow_tf32) { cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_FMA_MATH); } -#endif -#ifndef USE_CUDNN_RNN_V8_API else { // Technically, as the default it's not necessary to explicitly // set this. @@ -318,6 +312,15 @@ struct TORCH_CUDA_CPP_API RNNDescriptor : public Descriptor< } } #else + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + auto math_type = CUDNN_DEFAULT_MATH; + if (prop->major >= 7) { + if (input_type == CUDNN_DATA_HALF) { + math_type = CUDNN_TENSOR_OP_MATH; + } else if (!allow_tf32) { + math_type = CUDNN_FMA_MATH; + } + } AT_CUDNN_CHECK(cudnnSetRNNDescriptor_v8( mut_desc(), algo, @@ -327,7 +330,7 @@ struct TORCH_CUDA_CPP_API RNNDescriptor : public Descriptor< input_mode, input_type, datatype, - allow_tf32 ? CUDNN_DEFAULT_MATH : CUDNN_FMA_MATH, + math_type, input_size, hidden_size, proj_size ? proj_size : hidden_size, diff --git a/aten/src/ATen/cudnn/Handle.cpp b/aten/src/ATen/cudnn/Handle.cpp index ec0f416e85aea..f57744f129d98 100644 --- a/aten/src/ATen/cudnn/Handle.cpp +++ b/aten/src/ATen/cudnn/Handle.cpp @@ -34,7 +34,7 @@ using CudnnPoolType = at::cuda::DeviceThreadHandlePool +#include +namespace at { + +// AcceleratorHooksInterface is a shared interface provided by all +// accelerators to allow generic code. +// This inferface is hook-based as it corresponds to all the functions +// that are going to be called in a generic way from the CPU code. + +struct TORCH_API AcceleratorHooksInterface { + // This should never actually be implemented, but it is used to + // squelch -Werror=non-virtual-dtor + virtual ~AcceleratorHooksInterface() = default; + + // Whether the device at device_index is fully initialized or not. 
+ virtual bool hasPrimaryContext(DeviceIndex device_index) const = 0; + + virtual DeviceIndex deviceCount() const { + return 0; + } + + virtual void setCurrentDevice(DeviceIndex device) const { + TORCH_CHECK(false, "Backend doesn't support setCurrentDevice()"); + } + + virtual DeviceIndex getCurrentDevice() const { + TORCH_CHECK(false, "Backend doesn't support getCurrentDevice()"); + return -1; + } + + virtual DeviceIndex exchangeDevice(DeviceIndex device) const { + TORCH_CHECK(false, "Backend doesn't support exchangeDevice()"); + return -1; + } + + virtual DeviceIndex maybeExchangeDevice(DeviceIndex device) const { + TORCH_CHECK(false, "Backend doesn't support maybeExchangeDevice()"); + return -1; + } +}; + +} // namespace at diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 981fd1227a81d..860e49ff3d6f5 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -4,6 +4,8 @@ #include #include +#include + // Forward-declares at::Generator and at::cuda::NVRTC namespace at { struct Generator; @@ -57,10 +59,10 @@ constexpr const char* CUDA_HELP = // TODO: Consider putting the stub definitions in another class, so that one // never forgets to implement each virtual function in the real implementation // in CUDAHooks. This probably doesn't buy us much though. -struct TORCH_API CUDAHooksInterface { +struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { // This should never actually be implemented, but it is used to // squelch -Werror=non-virtual-dtor - virtual ~CUDAHooksInterface() = default; + virtual ~CUDAHooksInterface() override = default; // Initialize THCState and, transitively, the CUDA state virtual void initCUDA() const { @@ -99,6 +101,10 @@ struct TORCH_API CUDAHooksInterface { return false; } + virtual bool hasCuBLASLt() const { + return false; + } + virtual bool hasROCM() const { return false; } @@ -107,7 +113,7 @@ struct TORCH_API CUDAHooksInterface { TORCH_CHECK(false, "NVRTC requires CUDA. ", CUDA_HELP); } - virtual bool hasPrimaryContext(DeviceIndex device_index) const { + virtual bool hasPrimaryContext(DeviceIndex device_index) const override { TORCH_CHECK(false, "Cannot call hasPrimaryContext(", device_index, ") without ATen_cuda library. 
", CUDA_HELP); } diff --git a/aten/src/ATen/detail/HIPHooksInterface.h b/aten/src/ATen/detail/HIPHooksInterface.h index 3ce351b623908..7f4862c408680 100644 --- a/aten/src/ATen/detail/HIPHooksInterface.h +++ b/aten/src/ATen/detail/HIPHooksInterface.h @@ -38,7 +38,7 @@ struct TORCH_API HIPHooksInterface { return false; } - virtual int64_t current_device() const { + virtual c10::DeviceIndex current_device() const { return -1; } diff --git a/aten/src/ATen/detail/MAIAHooksInterface.cpp b/aten/src/ATen/detail/MAIAHooksInterface.cpp new file mode 100644 index 0000000000000..e82ad8f677018 --- /dev/null +++ b/aten/src/ATen/detail/MAIAHooksInterface.cpp @@ -0,0 +1,29 @@ +#include + +#include +#include + +#include +#include + +namespace at { +namespace detail { + +// See getCUDAHooks for some more commentary +const MAIAHooksInterface& getMAIAHooks() { + static std::unique_ptr maia_hooks; + static c10::once_flag once; + c10::call_once(once, [] { + maia_hooks = MAIAHooksRegistry()->Create("MAIAHooks", {}); + if (!maia_hooks) { + maia_hooks = std::make_unique(); + } + }); + return *maia_hooks; +} +} // namespace detail + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +C10_DEFINE_REGISTRY(MAIAHooksRegistry, MAIAHooksInterface, MAIAHooksArgs) + +} // namespace at diff --git a/aten/src/ATen/detail/MAIAHooksInterface.h b/aten/src/ATen/detail/MAIAHooksInterface.h new file mode 100644 index 0000000000000..ad4ef146eccd9 --- /dev/null +++ b/aten/src/ATen/detail/MAIAHooksInterface.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include + +// NB: Class must live in `at` due to limitations of Registry.h. +namespace at { + +struct TORCH_API MAIAHooksInterface { + // This should never actually be implemented, but it is used to + // squelch -Werror=non-virtual-dtor + virtual ~MAIAHooksInterface() = default; + + virtual std::string showConfig() const { + TORCH_CHECK(false, "Cannot query detailed MAIA version information."); + } +}; + +// NB: dummy argument to suppress "ISO C++11 requires at least one argument +// for the "..." in a variadic macro" +struct TORCH_API MAIAHooksArgs {}; + +TORCH_DECLARE_REGISTRY(MAIAHooksRegistry, MAIAHooksInterface, MAIAHooksArgs); +#define REGISTER_MAIA_HOOKS(clsname) \ + C10_REGISTER_CLASS(MAIAHooksRegistry, clsname, clsname) + +namespace detail { +TORCH_API const MAIAHooksInterface& getMAIAHooks(); +} // namespace detail + +} // namespace at diff --git a/aten/src/ATen/detail/MPSHooksInterface.h b/aten/src/ATen/detail/MPSHooksInterface.h index a982437505a4a..f82a802618d43 100644 --- a/aten/src/ATen/detail/MPSHooksInterface.h +++ b/aten/src/ATen/detail/MPSHooksInterface.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -11,13 +12,13 @@ namespace at { -struct TORCH_API MPSHooksInterface { +struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface { // this fails the implementation if MPSHooks functions are called, but // MPS backend is not present. 
#define FAIL_MPSHOOKS_FUNC(func) \ TORCH_CHECK(false, "Cannot execute ", func, "() without MPS backend."); - virtual ~MPSHooksInterface() = default; + virtual ~MPSHooksInterface() override = default; // Initialize the MPS library state virtual void initMPS() const { @@ -86,7 +87,9 @@ struct TORCH_API MPSHooksInterface { virtual double elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) const { FAIL_MPSHOOKS_FUNC(__func__); } - + virtual bool hasPrimaryContext(DeviceIndex device_index) const override { + FAIL_MPSHOOKS_FUNC(__func__); + } #undef FAIL_MPSHOOKS_FUNC }; diff --git a/aten/src/ATen/detail/MTIAHooksInterface.cpp b/aten/src/ATen/detail/MTIAHooksInterface.cpp index 6b69fdb03f3d8..0963881713861 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.cpp +++ b/aten/src/ATen/detail/MTIAHooksInterface.cpp @@ -8,19 +8,22 @@ namespace at { namespace detail { - -const MTIAHooksInterface &getMTIAHooks() { - static MTIAHooksInterface* MTIA_hooks = nullptr; +const MTIAHooksInterface& getMTIAHooks() { + static std::unique_ptr mtia_hooks = nullptr; static c10::once_flag once; c10::call_once(once, [] { - MTIA_hooks = - MTIAHooksRegistry()->Create("MTIAHooks", MTIAHooksArgs{}).release(); - if (!MTIA_hooks) { - MTIA_hooks = new MTIAHooksInterface(); + mtia_hooks = MTIAHooksRegistry()->Create("MTIAHooks", MTIAHooksArgs{}); + if (!mtia_hooks) { + mtia_hooks = std::make_unique(); } }); - return *MTIA_hooks; + return *mtia_hooks; +} + +bool isMTIAHooksBuilt() { + return MTIAHooksRegistry()->Has("MTIAHooks"); } + } // namespace detail C10_DEFINE_REGISTRY(MTIAHooksRegistry, MTIAHooksInterface, MTIAHooksArgs) diff --git a/aten/src/ATen/detail/MTIAHooksInterface.h b/aten/src/ATen/detail/MTIAHooksInterface.h index f969beef7a36e..1da1bda4e6130 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.h +++ b/aten/src/ATen/detail/MTIAHooksInterface.h @@ -1,9 +1,13 @@ #pragma once +#include #include +#include #include +#include + #include namespace at { @@ -17,25 +21,72 @@ constexpr const char* MTIA_HELP = "this error has occurred because you are trying " "to use some MTIA's functionality without MTIA extension included."; -struct TORCH_API MTIAHooksInterface { - virtual ~MTIAHooksInterface() = default; +struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { +// this fails the implementation if MTIAHooks functions are called, but +// MTIA backend is not present. +#define FAIL_MTIAHOOKS_FUNC(func) \ + TORCH_CHECK(false, "Cannot execute ", func, "() without MTIA backend."); + + virtual ~MTIAHooksInterface() override = default; virtual void initMTIA() const { - TORCH_CHECK( - false, - "Cannot initialize MTIA without MTIA Extension for PyTorch.", - MTIA_HELP); + // Avoid logging here, since MTIA needs init devices first then it will know + // how many devices are available. Make it as no-op if mtia extension is not + // dynamically loaded. 
+ return; } virtual bool hasMTIA() const { return false; } + virtual DeviceIndex deviceCount() const override { + return 0; + } + + virtual void deviceSynchronize(c10::DeviceIndex device_index) const { + FAIL_MTIAHOOKS_FUNC(__func__); + } + virtual std::string showConfig() const { - TORCH_CHECK( - false, - "Cannot query detailed MTIA version without MTIA Extension for PyTorch.", - MTIA_HELP); + FAIL_MTIAHOOKS_FUNC(__func__); + } + + virtual bool hasPrimaryContext(DeviceIndex device_index) const override { + return false; + } + + virtual void setCurrentDevice(DeviceIndex device) const override { + FAIL_MTIAHOOKS_FUNC(__func__); + } + + virtual DeviceIndex getCurrentDevice() const override { + FAIL_MTIAHOOKS_FUNC(__func__); + return -1; + } + + virtual DeviceIndex exchangeDevice(DeviceIndex device) const override { + FAIL_MTIAHOOKS_FUNC(__func__); + return -1; + } + + virtual DeviceIndex maybeExchangeDevice(DeviceIndex device) const override { + FAIL_MTIAHOOKS_FUNC(__func__); + return -1; + } + + virtual c10::Stream getCurrentStream(DeviceIndex device) const { + FAIL_MTIAHOOKS_FUNC(__func__); + return c10::Stream::unpack3(-1, 0, c10::DeviceType::MTIA); + } + + virtual c10::Stream getDefaultStream(DeviceIndex device) const { + FAIL_MTIAHOOKS_FUNC(__func__); + return c10::Stream::unpack3(-1, 0, c10::DeviceType::MTIA); + } + + virtual void setCurrentStream(const c10::Stream& stream) const { + FAIL_MTIAHOOKS_FUNC(__func__); } }; @@ -47,5 +98,6 @@ C10_DECLARE_REGISTRY(MTIAHooksRegistry, MTIAHooksInterface, MTIAHooksArgs); namespace detail { TORCH_API const MTIAHooksInterface& getMTIAHooks(); +TORCH_API bool isMTIAHooksBuilt(); } // namespace detail } // namespace at diff --git a/aten/src/ATen/detail/ORTHooksInterface.cpp b/aten/src/ATen/detail/ORTHooksInterface.cpp deleted file mode 100644 index bbb69809e8770..0000000000000 --- a/aten/src/ATen/detail/ORTHooksInterface.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include - -#include -#include - -#include -#include - -namespace at { -namespace detail { - -// See getCUDAHooks for some more commentary -const ORTHooksInterface& getORTHooks() { - static std::unique_ptr ort_hooks; - static c10::once_flag once; - c10::call_once(once, [] { - ort_hooks = ORTHooksRegistry()->Create("ORTHooks", {}); - if (!ort_hooks) { - ort_hooks = std::make_unique(); - } - }); - return *ort_hooks; -} -} // namespace detail - -// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) -C10_DEFINE_REGISTRY(ORTHooksRegistry, ORTHooksInterface, ORTHooksArgs) - -} // namespace at diff --git a/aten/src/ATen/detail/ORTHooksInterface.h b/aten/src/ATen/detail/ORTHooksInterface.h deleted file mode 100644 index f49969ec66a5b..0000000000000 --- a/aten/src/ATen/detail/ORTHooksInterface.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once - -#include -#include - -constexpr const char* ORT_HELP = - " You need to 'import torch_ort' to use the 'ort' device in PyTorch. " - "The 'torch_ort' module is provided by the ONNX Runtime itself " - "(https://onnxruntime.ai)."; - -// NB: Class must live in `at` due to limitations of Registry.h. -namespace at { - -struct TORCH_API ORTHooksInterface { - // This should never actually be implemented, but it is used to - // squelch -Werror=non-virtual-dtor - virtual ~ORTHooksInterface() = default; - - virtual std::string showConfig() const { - TORCH_CHECK(false, "Cannot query detailed ORT version information.", ORT_HELP); - } -}; - -// NB: dummy argument to suppress "ISO C++11 requires at least one argument -// for the "..." 
in a variadic macro" -struct TORCH_API ORTHooksArgs {}; - -TORCH_DECLARE_REGISTRY(ORTHooksRegistry, ORTHooksInterface, ORTHooksArgs); -#define REGISTER_ORT_HOOKS(clsname) \ - C10_REGISTER_CLASS(ORTHooksRegistry, clsname, clsname) - -namespace detail { -TORCH_API const ORTHooksInterface& getORTHooks(); -} // namespace detail - -} // namespace at diff --git a/aten/src/ATen/detail/PrivateUse1HooksInterface.cpp b/aten/src/ATen/detail/PrivateUse1HooksInterface.cpp index 8c3861c617ccc..ff267a41506bb 100644 --- a/aten/src/ATen/detail/PrivateUse1HooksInterface.cpp +++ b/aten/src/ATen/detail/PrivateUse1HooksInterface.cpp @@ -22,4 +22,15 @@ TORCH_API bool isPrivateUse1HooksRegistered() { return privateuse1_hooks != nullptr; } +namespace detail { + +TORCH_API const at::PrivateUse1HooksInterface& getPrivateUse1Hooks() { + TORCH_CHECK( + privateuse1_hooks != nullptr, + "Please register PrivateUse1HooksInterface by `RegisterPrivateUse1HooksInterface` first."); + return *privateuse1_hooks; } + +} // namespace detail + +} // namespace at diff --git a/aten/src/ATen/detail/PrivateUse1HooksInterface.h b/aten/src/ATen/detail/PrivateUse1HooksInterface.h index 142e812d28375..0b1b028ab4021 100644 --- a/aten/src/ATen/detail/PrivateUse1HooksInterface.h +++ b/aten/src/ATen/detail/PrivateUse1HooksInterface.h @@ -1,13 +1,17 @@ #pragma once #include +#include +#include #include +#include #include namespace at { -struct TORCH_API PrivateUse1HooksInterface { - virtual ~PrivateUse1HooksInterface() = default; - virtual const at::Generator& getDefaultGenerator(c10::DeviceIndex device_index) { +struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface { + virtual ~PrivateUse1HooksInterface() override = default; + virtual const at::Generator& getDefaultGenerator( + c10::DeviceIndex device_index) { TORCH_CHECK_NOT_IMPLEMENTED( false, "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDefaultGenerator`."); @@ -19,15 +23,39 @@ struct TORCH_API PrivateUse1HooksInterface { "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDeviceFromPtr`."); } + virtual Allocator* getPinnedMemoryAllocator() const { + TORCH_CHECK( + false, + "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getPinnedMemoryAllocator`."); + } + + virtual bool hasPrimaryContext(DeviceIndex device_index) const override { + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `hasPrimaryContext`."); + } + virtual void initPrivateUse1() const {} + virtual void resizePrivateUse1Bytes(const c10::Storage &storage, size_t newsize) const { + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `resizePrivateUse1Bytes`."); + } }; struct TORCH_API PrivateUse1HooksArgs {}; -TORCH_API void RegisterPrivateUse1HooksInterface(at::PrivateUse1HooksInterface* hook_); +TORCH_API void RegisterPrivateUse1HooksInterface( + at::PrivateUse1HooksInterface* hook_); TORCH_API at::PrivateUse1HooksInterface* GetPrivateUse1HooksInterface(); TORCH_API bool isPrivateUse1HooksRegistered(); -} +namespace detail { + +TORCH_API const at::PrivateUse1HooksInterface& getPrivateUse1Hooks(); + +} // namespace detail + +} // namespace at diff --git a/aten/src/ATen/detail/XPUHooksInterface.h b/aten/src/ATen/detail/XPUHooksInterface.h index 9a3836dbcc78e..8e5e0d8243ab7 100644 --- a/aten/src/ATen/detail/XPUHooksInterface.h +++ b/aten/src/ATen/detail/XPUHooksInterface.h @@ -9,11 +9,6 @@ 
#include #include -// We use forward declaration here instead of #include to avoid -// leaking DLPack implementation detail to every project that includes `ATen/Context.h`, which in turn -// would lead to a conflict when linked with another project using DLPack (for example TVM) -struct DLDevice_; - namespace at { constexpr const char* XPU_HELP = @@ -44,23 +39,8 @@ struct TORCH_API XPUHooksInterface { XPU_HELP); } - virtual Device getATenDeviceFromDLPackDevice( - const DLDevice_& dl_device, - void* data) const { - TORCH_CHECK( - false, - "Cannot get XPU device without Intel Extension for Pytorch. ", - XPU_HELP); - } - - virtual DLDevice_& getDLPackDeviceFromATenDevice( - DLDevice_& dl_device, - const Device& aten_device, - void* data) const { - TORCH_CHECK( - false, - "Cannot get XPU DL device without Intel Extension for Pytorch. ", - XPU_HELP); + virtual int32_t getGlobalIdxFromDevice(const Device& device) const { + TORCH_CHECK(false, "Cannot get XPU global device index without ATen_xpu library."); } virtual Generator getXPUGenerator(C10_UNUSED DeviceIndex device_index = -1) const { @@ -71,9 +51,29 @@ struct TORCH_API XPUHooksInterface { TORCH_CHECK(false, "Cannot get default XPU generator without Intel Extension for Pytorch. ", XPU_HELP); } - virtual int getNumGPUs() const { + virtual DeviceIndex getNumGPUs() const { return 0; } + + virtual DeviceIndex current_device() const { + TORCH_CHECK(false, "Cannot get current device on XPU without ATen_xpu library."); + } + + virtual Device getDeviceFromPtr(void* /*data*/) const { + TORCH_CHECK(false, "Cannot get device of pointer on XPU without ATen_xpu library."); + } + + virtual void deviceSynchronize(DeviceIndex /*device_index*/) const { + TORCH_CHECK(false, "Cannot synchronize XPU device without ATen_xpu library."); + } + + virtual Allocator* getPinnedMemoryAllocator() const { + TORCH_CHECK(false, "Cannot get XPU pinned memory allocator without ATen_xpu library."); + } + + virtual bool isPinnedPtr(const void* /*data*/) const { + return false; + } }; struct TORCH_API XPUHooksArgs {}; diff --git a/aten/src/ATen/dlpack.h b/aten/src/ATen/dlpack.h index eb33058807718..9601a2478ddde 100644 --- a/aten/src/ATen/dlpack.h +++ b/aten/src/ATen/dlpack.h @@ -94,10 +94,7 @@ typedef enum { /*! * \brief A Device for Tensor and operator. */ -// NB: This is the only difference from -// https://github.com/dmlc/dlpack/blob/v0.7/include/dlpack/dlpack.h Required to -// allow forward declaration of DLDevice. -typedef struct DLDevice_ { +typedef struct { /*! \brief The device type used in the device. */ DLDeviceType device_type; /*! @@ -198,12 +195,12 @@ typedef struct { /*! \brief The data type of the pointer*/ DLDataType dtype; /*! \brief The shape of the tensor */ - int64_t* shape; + const int64_t* shape; /*! * \brief strides of the tensor (in number of elements, not bytes) * can be NULL, indicating tensor is compact and row-majored. */ - int64_t* strides; + const int64_t* strides; /*! 
\brief The offset in bytes to the beginning pointer to data */ uint64_t byte_offset; } DLTensor; diff --git a/aten/src/ATen/functorch/ADInterpreters.cpp b/aten/src/ATen/functorch/ADInterpreters.cpp index e113f5b01ad73..2f0de0b159b6a 100644 --- a/aten/src/ATen/functorch/ADInterpreters.cpp +++ b/aten/src/ATen/functorch/ADInterpreters.cpp @@ -3,7 +3,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { constexpr size_t default_bitset_size = 64; @@ -73,7 +73,7 @@ static void autogradBasedTransformProcess( return materializeGradWrappers(tensor, current_level); }; auto num_args = op.schema().arguments().size(); - foreachTensorInplace(*stack, stack->size() - num_args, stack->size(), maybeTransformGradWrappers); + foreachTensorInplace(*stack, static_cast(stack->size() - num_args), static_cast(stack->size()), maybeTransformGradWrappers); setup_dispatch_key_tls(transform_type, {}); op.callBoxed(stack); @@ -133,7 +133,7 @@ static void autogradBasedTransformSendToNext( auto args_size = op.schema().arguments().size(); const auto ret_size = op.schema().returns().size(); // Step 1 - auto front = stack->size() - args_size; + auto front = static_cast(stack->size()) - args_size; for (const auto arg_idx : c10::irange(0, args_size)) { stack->push_back((*stack)[front + arg_idx]); } @@ -151,7 +151,7 @@ static void autogradBasedTransformSendToNext( // if the input is immutable, we find if it aliases anything, noting that // args are in reverse order on stack, so the last arg is at the top of the stack const auto relative_pos = idx - (stack->size() - args_size); - const auto aliased_out = findAliasedOutput(op.schema(), relative_pos); + const auto aliased_out = findAliasedOutput(op.schema(), static_cast(relative_pos)); if (aliased_out.has_value()) { outputs_aliasing_immutable.flip(*aliased_out); // each output aliases at most one input, so we can only hit this once } @@ -160,7 +160,7 @@ static void autogradBasedTransformSendToNext( } // Step 2 - foreachTensorInplace(*stack, stack->size() - args_size, stack->size(), unwrap); + foreachTensorInplace(*stack, static_cast(stack->size() - args_size), static_cast(stack->size()), unwrap); // See NOTE [grad and vjp interaction with no_grad] optional grad_guard; @@ -183,7 +183,7 @@ static void autogradBasedTransformSendToNext( op.callBoxed(stack); // Step 4 - foreachTensorInplaceWithFlag(*stack, stack->size() - ret_size, stack->size(), outputs_aliasing_immutable, wrap); + foreachTensorInplaceWithFlag(*stack, static_cast(stack->size() - ret_size), static_cast(stack->size()), outputs_aliasing_immutable, wrap); // Step 5 auto args_front = stack->size() - args_size - ret_size; @@ -200,7 +200,7 @@ static void autogradBasedTransformSendToNext( } // Step 6 - stack->erase(stack->end() - (args_size + ret_size), stack->end() - ret_size); + stack->erase(stack->end() - std::ptrdiff_t(args_size + ret_size), stack->end() - std::ptrdiff_t(ret_size)); } void GradInterpreterPtr::processImpl( @@ -239,4 +239,4 @@ void JvpInterpreterPtr::sendToNextInterpreterImpl( grad_special_case); } -}} // namespace at::functorch +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesActivation.cpp b/aten/src/ATen/functorch/BatchRulesActivation.cpp index b26ec74d84af7..87a7865b05054 100644 --- a/aten/src/ATen/functorch/BatchRulesActivation.cpp +++ b/aten/src/ATen/functorch/BatchRulesActivation.cpp @@ -10,7 +10,7 @@ // NB: most activation functions fit pointwise unary or binary rules. 
// These are only the ones that have special batch rules to help with organization -namespace at { namespace functorch { +namespace at::functorch { static std::tuple> glu_batch_rule(const Tensor& self, optional self_bdim, int64_t dim) { // repeated error message from glu because 0D -> 1D when batched @@ -53,4 +53,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT(glu_backward, glu_backward_batch_rule); VMAP_SUPPORT(glu, glu_batch_rule); } -}} // namespace at::functorch +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp index 1dd417052cf10..44ca2802bf3a2 100644 --- a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp @@ -11,7 +11,7 @@ #include -namespace at { namespace functorch { +namespace at::functorch { template std::tuple> _binary_pointwise_batch_rule( @@ -60,13 +60,9 @@ struct BinaryRandomPointwiseBatchRuleHelper> { auto cur_level = maybe_layer->layerId(); RandomnessType randomness = maybe_layer->randomness(); - Tensor tensor_value; - optional tensor_bdim; - std::tie(tensor_value, tensor_bdim) = unwrapTensorAtLevel(tensor, cur_level); + auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(tensor, cur_level); - Tensor other_value; - optional other_bdim; - std::tie(other_value, other_bdim) = unwrapTensorAtLevel(other, cur_level); + auto [other_value, other_bdim] = unwrapTensorAtLevel(other, cur_level); check_randomness(randomness, (tensor_bdim || other_bdim)); if (randomness == RandomnessType::Different && !tensor_bdim && !other_bdim) { @@ -520,4 +516,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT2(fill_, Tensor, fill__Tensor_batch_rule); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesConvolution.cpp b/aten/src/ATen/functorch/BatchRulesConvolution.cpp index c25c4972da25d..ca4eda19a36fb 100644 --- a/aten/src/ATen/functorch/BatchRulesConvolution.cpp +++ b/aten/src/ATen/functorch/BatchRulesConvolution.cpp @@ -8,7 +8,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { // convolution_batch_rule translated from jax with modifications: // https://github.com/google/jax/blob/master/jax/_src/lax/lax.py#L3143 @@ -29,7 +29,7 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens // If we have a batched bias or weight, we need to perform the computation separately. 
optional unbatched_bias; - bool separate_bias; + bool separate_bias = false; if ((rhs_bdim && bias && bias->defined()) || bias_bdim) { TORCH_INTERNAL_ASSERT(bias.has_value()); TORCH_INTERNAL_ASSERT(bias->defined()); @@ -245,7 +245,7 @@ convolution_backward_input_batch_rule( const Tensor& input, optional input_bdim, const Tensor& weight, optional weight_bdim, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, - c10::SymIntArrayRef output_padding, c10::SymInt groups) { + c10::SymIntArrayRef output_padding, const c10::SymInt& groups) { const std::array mask = {true, false, false}; if (grad_output_bdim && weight_bdim) { // regular: BNO, BOI -> N(BO), (BO)I -> N(BI) @@ -326,7 +326,7 @@ convolution_backward_weight_batch_rule( const Tensor& input, optional input_bdim, const Tensor& weight, optional weight_bdim, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, - c10::SymIntArrayRef output_padding, c10::SymInt groups) { + c10::SymIntArrayRef output_padding, const c10::SymInt& groups) { const std::array mask = {false, true, false}; if (grad_output_bdim && input_bdim) { // BNO, BNI -> N(BO), N(BI) -> (BO)I (regular) (BI)O (transposed) @@ -449,15 +449,9 @@ static std::tuple convolution_backward_plumbing( dilation, transposed, output_padding, groups, output_mask); } - Tensor grad_output; - optional grad_output_bdim; - std::tie(grad_output, grad_output_bdim) = unwrapTensorAtLevel(grad_output_, cur_level); - Tensor input; - optional input_bdim; - std::tie(input, input_bdim) = unwrapTensorAtLevel(input_, cur_level); - Tensor weight; - optional weight_bdim; - std::tie(weight, weight_bdim) = unwrapTensorAtLevel(weight_, cur_level); + auto [grad_output, grad_output_bdim] = unwrapTensorAtLevel(grad_output_, cur_level); + auto [input, input_bdim] = unwrapTensorAtLevel(input_, cur_level); + auto [weight, weight_bdim] = unwrapTensorAtLevel(weight_, cur_level); const auto grad_bias = compute_grad_bias(grad_output_, output_mask); output_mask[2] = false; @@ -542,4 +536,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("convolution_backward", convolution_backward_plumbing); } -}} // namespace at;:functorch +} // namespace at;:functorch diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp index 1b179a505e9a9..3e064d6c39dc7 100644 --- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp +++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp @@ -13,7 +13,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { #define OP_DECOMPOSE(op) m.impl(#op, static_cast(native::op)); #define OP_DECOMPOSE2(op, overload) m.impl(#op"."#overload, static_cast(native::op)); @@ -226,6 +226,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) { m.impl("reshape", native::reshape_symint); OP_DECOMPOSE(resolve_conj); OP_DECOMPOSE(resolve_neg); + OP_DECOMPOSE(rms_norm); OP_DECOMPOSE(row_stack); OP_DECOMPOSE(rrelu); OP_DECOMPOSE(rrelu_); @@ -383,4 +384,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) { OP_DECOMPOSE2(to, other); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesDynamic.cpp b/aten/src/ATen/functorch/BatchRulesDynamic.cpp index b31d68a5768dd..e001e17f5d931 100644 --- a/aten/src/ATen/functorch/BatchRulesDynamic.cpp +++ b/aten/src/ATen/functorch/BatchRulesDynamic.cpp @@ -15,7 +15,7 @@ // errors for them. 
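Many of the functorch hunks in this patch are mechanical modernizations rather than behavior changes: namespace at { namespace functorch { collapses into a C++17 nested namespace definition, std::tie into pre-declared locals becomes a structured binding, and c10::SymInt parameters move to const& to avoid a copy. A condensed illustration, using hypothetical toy types (demo::batching, SymSize, unwrap) rather than the ATen ones, looks like this:

#include <iostream>
#include <string>
#include <tuple>

namespace demo::batching {  // C++17 nested namespace, replacing namespace demo { namespace batching {

struct SymSize { long value; };  // toy stand-in for c10::SymInt

// Taking SymSize by const reference instead of by value, as the patch does for c10::SymInt.
std::tuple<std::string, long> unwrap(const std::string& name, const SymSize& size) {
  return {name + "_unwrapped", size.value};
}

}  // namespace demo::batching

int main() {
  demo::batching::SymSize s{4};
  // Old style: std::string name; long value; std::tie(name, value) = unwrap("tensor", s);
  auto [name, value] = demo::batching::unwrap("tensor", s);  // new style: structured binding
  std::cout << name << " " << value << "\n";  // tensor_unwrapped 4
  return 0;
}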
-namespace at { namespace functorch { +namespace at::functorch { namespace { void unsupportedDynamicOp(const c10::OperatorHandle& op, torch::jit::Stack* stack) { @@ -76,4 +76,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("allclose", torch::CppFunction::makeFromBoxedFunction<&unsupportedAllclose>()); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesFactory.cpp b/aten/src/ATen/functorch/BatchRulesFactory.cpp index 09430ce5f2483..f317fee6af6c7 100644 --- a/aten/src/ATen/functorch/BatchRulesFactory.cpp +++ b/aten/src/ATen/functorch/BatchRulesFactory.cpp @@ -7,7 +7,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { template struct NewBlahBatchRuleHelperSymInt; @@ -243,4 +243,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT(_new_zeros_with_same_feature_meta, _new_zeros_with_same_feature_meta_batch_rule); // Not sure how to add the ones with irregular args to the mix cleanly (i.e. randint takes an extra int parameter) } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesHelper.cpp b/aten/src/ATen/functorch/BatchRulesHelper.cpp index 89e2b4c5c772d..edac0ebde7914 100644 --- a/aten/src/ATen/functorch/BatchRulesHelper.cpp +++ b/aten/src/ATen/functorch/BatchRulesHelper.cpp @@ -7,7 +7,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { Tensor moveBatchDimToFront(const Tensor& tensor, optional maybe_batch_dim) { if (!maybe_batch_dim.has_value()) { @@ -118,11 +118,9 @@ Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x) { // NOTE: 0 % 0 leads to FPE TORCH_INTERNAL_ASSERT(shape[src] % size1 == 0); } - int64_t size2; // split any size out of `0`-sized dim - if (shape[src] == 0) { - size2 = 0; - } else { + int64_t size2 = 0; + if (shape[src] != 0) { size2 = shape[src] / size1; } shape[src] = size1; @@ -130,7 +128,7 @@ Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x) { return at::reshape(x, shape); } -Tensor reshape_dim_outof_symint(int64_t src, c10::SymInt size1, const Tensor& x) { +Tensor reshape_dim_outof_symint(int64_t src, const c10::SymInt& size1, const Tensor& x) { src = maybe_wrap_dim(src, x.dim()); c10::SymDimVector shape(x.sym_sizes().begin(), x.sym_sizes().end()); if (shape[src] != 0) { @@ -204,4 +202,4 @@ std::tuple _binary_pointwise_helper( return std::make_tuple(tensor_, other_); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesHelper.h b/aten/src/ATen/functorch/BatchRulesHelper.h index 74217d8464d0a..9bb31e09ce4c5 100644 --- a/aten/src/ATen/functorch/BatchRulesHelper.h +++ b/aten/src/ATen/functorch/BatchRulesHelper.h @@ -28,7 +28,7 @@ namespace at::functorch { TORCH_API Tensor reshape_dim_into(int64_t src, int64_t dst, const Tensor& x); TORCH_API Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x); -TORCH_API Tensor reshape_dim_outof_symint(int64_t src, c10::SymInt size1, const Tensor& x); +TORCH_API Tensor reshape_dim_outof_symint(int64_t src, const c10::SymInt& size1, const Tensor& x); Tensor moveBatchDimToFront(const Tensor& tensor, optional maybe_batch_dim); int64_t rankWithoutBatchDim(const Tensor& tensor, optional maybe_batch_dim); @@ -144,11 +144,9 @@ void boxed_tensor_inputs_batch_rule(const c10::OperatorHandle& op, torch::jit::S for (const auto idx : c10::irange(0, num_arguments)) { const auto& ivalue = arguments[idx]; if (ivalue.isTensor()) { - Tensor tensor_value; - optional tensor_bdim; - std::tie(tensor_value, tensor_bdim) = 
unwrapTensorAtLevel(ivalue.toTensor(), cur_level); + auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(ivalue.toTensor(), cur_level); tensor_inputs.emplace_back(tensor_value, tensor_bdim); - tensor_pos.push_back(idx); + tensor_pos.push_back(static_cast(idx)); } } Func(tensor_inputs); @@ -214,7 +212,7 @@ inline void find_and_unpack_tensors( int64_t* batch_size) { int64_t computed_batch_size = -1; - int64_t args_begin = stack->size() - num_args; + int64_t args_begin = static_cast(stack->size()) - num_args; for (const auto idx : c10::irange(0, num_args)) { const auto& ivalue = (*stack)[args_begin + idx]; @@ -243,7 +241,7 @@ inline void boxed_existing_bdim_all_batch_rule( const c10::OperatorHandle& op, torch::jit::Stack* stack) { const auto& schema = op.schema(); const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); + const auto num_arguments = static_cast(schema.arguments().size()); c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); auto maybe_layer = maybeCurrentDynamicLayer(); @@ -256,10 +254,10 @@ inline void boxed_existing_bdim_all_batch_rule( return; } - int64_t args_begin = stack->size() - num_arguments; + int64_t args_begin = static_cast(stack->size()) - num_arguments; SmallVector tensor_inputs; SmallVector tensor_pos; - int64_t batch_size; + int64_t batch_size = 0; find_and_unpack_tensors( stack, num_arguments, cur_level, @@ -312,13 +310,13 @@ inline void boxed_all_tensors_have_optional_bdim( return; } - int64_t args_begin = stack->size() - num_arguments; + int64_t args_begin = static_cast(stack->size() - num_arguments); SmallVector tensor_inputs; SmallVector tensor_pos; - int64_t batch_size; + int64_t batch_size = 0; find_and_unpack_tensors( - stack, num_arguments, cur_level, + stack, static_cast(num_arguments), cur_level, &tensor_inputs, &tensor_pos, &batch_size); optional is_no_batch_dim_case; diff --git a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp index c3158214ba087..6a17adb4e268c 100644 --- a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp +++ b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp @@ -6,7 +6,7 @@ #include -namespace at { namespace functorch { +namespace at::functorch { typedef std::tuple> oneOutput; typedef std::tuple, Tensor, optional> twoOutputs; @@ -265,6 +265,28 @@ static void expect_at_least_rank( rank, " dimensions instead."); } +threeOutputs linalg_lu_unpack_batch_rule( + const Tensor& LU, optional LU_bdim, + const Tensor& pivots, optional pivots_bdim, + bool unpack_data, bool unpack_pivots) { + auto LU_ = moveBatchDimToFront(LU, LU_bdim); + auto pivots_ = moveBatchDimToFront(pivots, pivots_bdim); + + // LU and pivots's first {N-2} (for LU), {N-1} (for pivots) dimensions must + // match So if only one of them is being vmapped over, we must expand out that + // dimension. 
+ if (LU_bdim.has_value() != pivots_bdim.has_value()) { + auto bdim_size = get_bdim_size2(LU, LU_bdim, pivots, pivots_bdim); + LU_ = ensure_has_bdim(LU_, LU_bdim.has_value(), bdim_size); + pivots_ = ensure_has_bdim(pivots_, pivots_bdim.has_value(), bdim_size); + pivots_bdim = 0; + LU_bdim = 0; + } + + const auto res = at::lu_unpack(LU_, pivots_, unpack_data, unpack_pivots); + return std::make_tuple(std::get<0>(res), 0, std::get<1>(res), 0, std::get<2>(res), 0); +} + oneOutput linalg_lu_solve_batch_rule( const Tensor& LU, optional LU_bdim, const Tensor& pivots, optional pivots_bdim, @@ -348,7 +370,7 @@ fourOutputs solve_ex_batch_rule( TORCH_CHECK(A_logical_rank >= 2, "linalg.solve: The input tensor A must have at least 2 dimensions."); - int b_logical_rank = max_logical_rank; + auto b_logical_rank = max_logical_rank; if (A_logical_rank > B_logical_rank) { // vector case: B was a vector or batched vector // not accurate but matches linalg error message TORCH_CHECK(B_logical_rank >= 1, "linalg.solve: The input tensor B must have at least 2 dimensions."); @@ -417,8 +439,7 @@ fourOutputs linalg_lstsq_batch_rule( const auto self_ = ensure_has_bdim(std::get<0>(tensor_other), self_bdim.has_value(), batch_size); const auto b_ = ensure_has_bdim(std::get<1>(tensor_other), b_bdim.has_value(), batch_size); - Tensor res, res_1, res_2, res_3; - std::tie(res, res_1, res_2, res_3) = at::linalg_lstsq(self_, b_, rcond, driver); + auto [res, res_1, res_2, res_3] = at::linalg_lstsq(self_, b_, rcond, driver); // everything but the 0th output are only sometimes computed. When they aren't, they're empty tensors without a bdim const auto res_1_bdim = batch_dim_if_not_empty(res_1); @@ -553,6 +574,7 @@ pinv_batch_rule( } // These need to be outside. String constant must be declared outside of a macro to be used as template param +// NOLINTBEGIN(*array*) LINALG_CHECK_MATRIX_UNARY_ONE_OUT(cholesky, cholesky); LINALG_CHECK_MATRIX_UNARY_ONE_OUT(cholesky_inverse, cholesky_inverse); LINALG_CHECK_MATRIX_UNARY_TWO_OUT(linalg_cholesky_ex, linalg.cholesky); @@ -569,6 +591,7 @@ LINALG_CHECK_MATRIX_UNARY_THREE_OUT(_linalg_det, linalg.det); LINALG_CHECK_MATRIX_UNARY_TWO_OUT(_linalg_eigh, linalg.eigh); LINALG_CHECK_MATRIX_UNARY_FOUR_OUT(_linalg_slogdet, linalg.slogdet); LINALG_CHECK_MATRIX_UNARY_THREE_OUT(_linalg_svd, linalg.svd); +// NOLINTEND(*array*) TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT(bmm, bmm_batch_rule); @@ -579,6 +602,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT(dot, dot_batch_rule); VMAP_SUPPORT(mv, mv_batch_rule); VMAP_SUPPORT(mm, mm_batch_rule); + VMAP_SUPPORT(lu_unpack, linalg_lu_unpack_batch_rule); VMAP_SUPPORT(linalg_lu_solve, linalg_lu_solve_batch_rule); VMAP_SUPPORT(linalg_householder_product, householder_product_batch_rule); VMAP_SUPPORT(cholesky_solve, cholesky_solve_batch_rule); // custom dim error @@ -593,4 +617,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("vdot", vdot_decomp); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesLoss.cpp b/aten/src/ATen/functorch/BatchRulesLoss.cpp index 470f18305f033..22f3adff95a01 100644 --- a/aten/src/ATen/functorch/BatchRulesLoss.cpp +++ b/aten/src/ATen/functorch/BatchRulesLoss.cpp @@ -9,7 +9,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { // Flattens out all dims except the batch dim, and also moves batch dim // (if it exists) to front. 
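The new lu_unpack batch rule above follows the standard vmap recipe used throughout these files: move any batch dimension to the front, and if only one operand is batched, expand the other so both share the same leading batch size before calling the underlying op once. The following rough, standalone sketch (linking against libtorch, using the public at::linalg_lu_factor and at::lu_unpack calls) shows the same expansion idea on plain tensors; it is not the functorch machinery itself, which works on (Tensor, optional bdim) pairs via moveBatchDimToFront/ensure_has_bdim.

#include <ATen/ATen.h>
#include <cstdint>
#include <iostream>

int main() {
  const int64_t batch = 4;
  // Batched 3x3 matrices: LU factors come out as [B, 3, 3], pivots as [B, 3].
  auto A = at::rand({batch, 3, 3});
  auto lu = at::linalg_lu_factor(A);
  auto LU = std::get<0>(lu);          // [4, 3, 3], batch dim already in front
  auto pivots = std::get<1>(lu)[0];   // pretend only LU is being vmapped: unbatched [3]

  // ensure_has_bdim-style expansion for the unbatched operand so both share
  // the same leading batch size before the single call to the base op.
  auto pivots_b = pivots.unsqueeze(0).expand({batch, pivots.size(0)});

  auto unpacked = at::lu_unpack(LU, pivots_b);          // (P, L, U)
  std::cout << std::get<1>(unpacked).sizes() << "\n";   // L: [4, 3, 3]
  return 0;
}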
static at::Tensor flatten_logical(const Tensor& tensor, optional bdim) { @@ -98,12 +98,8 @@ static Tensor binary_cross_entropy_plumbing( return at::binary_cross_entropy(self, target, weight, reduction); } - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); - Tensor target_value; - optional target_bdim; - std::tie(target_value, target_bdim) = unwrapTensorAtLevel(target, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); + auto [target_value, target_bdim] = unwrapTensorAtLevel(target, cur_level); Tensor result; if (self_bdim || target_bdim) { @@ -137,16 +133,10 @@ static Tensor binary_cross_entropy_backward_plumbing( return at::binary_cross_entropy_backward(grad, input, target, weight_opt, reduction); } - Tensor grad_value; - optional grad_bdim; - std::tie(grad_value, grad_bdim) = unwrapTensorAtLevel( + auto [grad_value, grad_bdim] = unwrapTensorAtLevel( reduction == Reduction::None ? grad : grad.expand_as(input), cur_level); - Tensor input_value; - optional input_bdim; - std::tie(input_value, input_bdim) = unwrapTensorAtLevel(input, cur_level); - Tensor target_value; - optional target_bdim; - std::tie(target_value, target_bdim) = unwrapTensorAtLevel(target, cur_level); + auto [input_value, input_bdim] = unwrapTensorAtLevel(input, cur_level); + auto [target_value, target_bdim] = unwrapTensorAtLevel(target, cur_level); Tensor grad_input; if (grad_bdim || input_bdim || target_bdim) { @@ -190,4 +180,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("binary_cross_entropy_backward", binary_cross_entropy_backward_plumbing); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp index 987c2edaabff1..875af39214453 100644 --- a/aten/src/ATen/functorch/BatchRulesModules.cpp +++ b/aten/src/ATen/functorch/BatchRulesModules.cpp @@ -10,7 +10,7 @@ #include -namespace at { namespace functorch { +namespace at::functorch { static Tensor getStepTensor(const Tensor& indices, const c10::SymInt& bdim_size, const c10::SymInt& num_embeddings) { // [batch_size, 1, 1, 1, ..., 1] @@ -218,16 +218,16 @@ cudnn_grid_sample_backward_batch_rule( // TODO: replace with targetable functionalization static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes) { TORCH_CHECK(self.dtype() == kLong, "one_hot is only applicable to index tensor."); - auto shape = self.sizes().vec(); + auto shape = self.sym_sizes().vec(); // empty tensor could be converted to one hot representation, // but shape inference is not possible. 
- if (self.numel() == 0) { + if (self.sym_numel() == 0) { if (num_classes <= 0) { AT_ERROR("Can not infer total number of classes from empty tensor."); } else { shape.push_back(num_classes); - return at::empty(shape, self.options()); + return at::empty_symint(shape, self.options()); } } @@ -247,7 +247,7 @@ static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes // } shape.push_back(num_classes); - Tensor ret = at::zeros(shape, self.options()); + Tensor ret = at::zeros_symint(shape, self.options()); return ret.scatter(-1, self.unsqueeze(-1), 1); } @@ -402,4 +402,5 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("one_hot", one_hot_decomposition_hack); } -}} + +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesNorm.cpp b/aten/src/ATen/functorch/BatchRulesNorm.cpp index 42f68b731af45..faf39d8e374a3 100644 --- a/aten/src/ATen/functorch/BatchRulesNorm.cpp +++ b/aten/src/ATen/functorch/BatchRulesNorm.cpp @@ -9,7 +9,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { static bool is_empty_tensor(const Tensor& tensor) { const auto shape = tensor.sizes(); @@ -225,12 +225,8 @@ std::tuple batch_norm_backward_plumbing( vmap_check_escaped(maybe_layer, "batch_norm_backward_plumbing"); int64_t cur_level = maybe_layer->layerId(); - Tensor grad_out_value; - optional grad_out_bdim; - std::tie(grad_out_value, grad_out_bdim) = unwrapTensorAtLevel(grad_out, cur_level); - Tensor input_value; - optional input_bdim; - std::tie(input_value, input_bdim) = unwrapTensorAtLevel(input, cur_level); + auto [grad_out_value, grad_out_bdim] = unwrapTensorAtLevel(grad_out, cur_level); + auto [input_value, input_bdim] = unwrapTensorAtLevel(input, cur_level); Tensor mean_value; optional weight_value; optional weight_bdim; @@ -247,12 +243,8 @@ std::tuple batch_norm_backward_plumbing( if (running_var.defined()) { std::tie(running_var_value, running_var_bdim) = unwrapTensorAtLevel(running_var, cur_level); } - Tensor save_mean_value; - optional save_mean_bdim; - std::tie(save_mean_value, save_mean_bdim) = unwrapTensorAtLevel(save_mean, cur_level); - Tensor save_rstd_value; - optional save_rstd_bdim; - std::tie(save_rstd_value, save_rstd_bdim) = unwrapTensorAtLevel(save_rstd, cur_level); + auto [save_mean_value, save_mean_bdim] = unwrapTensorAtLevel(save_mean, cur_level); + auto [save_rstd_value, save_rstd_bdim] = unwrapTensorAtLevel(save_rstd, cur_level); // results Tensor grad_bias; @@ -274,9 +266,7 @@ std::tuple batch_norm_backward_plumbing( if (output_mask[0]) { const auto grad_normalized_input = weight.defined() ? 
grad_out.transpose(0, 1) * padRight(weight, nullopt, grad_out.dim()) : grad_out.transpose(0, 1); // [B0, C, B, *] - Tensor grad_normalized_input_value; - optional grad_normalized_input_bdim; - std::tie(grad_normalized_input_value, grad_normalized_input_bdim) = + auto [grad_normalized_input_value, grad_normalized_input_bdim] = unwrapTensorAtLevel(grad_normalized_input.transpose(0, 1), cur_level); // [B0, B, C, *] c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); @@ -312,9 +302,7 @@ static std::tuple native_group_norm_plumbing( return at::native_group_norm(input, weight_opt, bias_opt, N, C, HxW, group, eps); } - Tensor input_value; - optional input_bdim; - std::tie(input_value, input_bdim) = unwrapTensorAtLevel(input, cur_level); + auto [input_value, input_bdim] = unwrapTensorAtLevel(input, cur_level); Tensor result0; Tensor mean; @@ -401,20 +389,14 @@ static std::tuple native_group_norm_backward_plumbing( return at::native_group_norm_backward(grad_out, input, mean, rstd, weight_opt, N, C, HxW, group, output_mask); } - Tensor input_value; - optional input_bdim; - std::tie(input_value, input_bdim) = unwrapTensorAtLevel(input, cur_level); + auto [input_value, input_bdim] = unwrapTensorAtLevel(input, cur_level); Tensor weight_value; optional weight_bdim; if (weight.defined()){ std::tie(weight_value, weight_bdim) = unwrapTensorAtLevel(weight, cur_level); } - Tensor mean_value; - optional mean_bdim; - std::tie(mean_value, mean_bdim) = unwrapTensorAtLevel(mean, cur_level); - Tensor rstd_value; - optional rstd_bdim; - std::tie(rstd_value, rstd_bdim) = unwrapTensorAtLevel(rstd, cur_level); + auto [mean_value, mean_bdim] = unwrapTensorAtLevel(mean, cur_level); + auto [rstd_value, rstd_bdim] = unwrapTensorAtLevel(rstd, cur_level); // results Tensor grad_input; @@ -436,9 +418,7 @@ static std::tuple native_group_norm_backward_plumbing( if (output_mask[0]) { const auto grad_normalized_input = weight.defined() ? 
grad_out * padRight(weight, nullopt, grad_out.dim() - 1) : grad_out; - Tensor grad_normalized_input_value; - optional grad_normalized_input_bdim; - std::tie(grad_normalized_input_value, grad_normalized_input_bdim) = + auto [grad_normalized_input_value, grad_normalized_input_bdim] = unwrapTensorAtLevel(grad_normalized_input, cur_level); c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); @@ -494,7 +474,7 @@ C10_ALWAYS_INLINE void _check_layer_norm_inputs( const Tensor& weight, optional weight_bdim, const Tensor& bias, optional bias_bdim) { - const int normalized_ndim = normalized_shape.size(); + const auto normalized_ndim = normalized_shape.size(); TORCH_CHECK( normalized_ndim >= 1, "Expected normalized_shape to be at least 1-dimensional, i.e., ", @@ -611,18 +591,10 @@ static std::tuple native_layer_norm_backward_p return at::native_layer_norm_backward(grad_out, input, normalized_shape, mean, rstd, weight_opt, bias_opt, output_mask); } - Tensor grad_out_value; - optional grad_out_bdim; - std::tie(grad_out_value, grad_out_bdim) = unwrapTensorAtLevel(grad_out, cur_level); - Tensor input_value; - optional input_bdim; - std::tie(input_value, input_bdim) = unwrapTensorAtLevel(input, cur_level); - Tensor mean_value; - optional mean_bdim; - std::tie(mean_value, mean_bdim) = unwrapTensorAtLevel(mean, cur_level); - Tensor rstd_value; - optional rstd_bdim; - std::tie(rstd_value, rstd_bdim) = unwrapTensorAtLevel(rstd, cur_level); + auto [grad_out_value, grad_out_bdim] = unwrapTensorAtLevel(grad_out, cur_level); + auto [input_value, input_bdim] = unwrapTensorAtLevel(input, cur_level); + auto [mean_value, mean_bdim] = unwrapTensorAtLevel(mean, cur_level); + auto [rstd_value, rstd_bdim] = unwrapTensorAtLevel(rstd, cur_level); optional weight_value; optional weight_bdim; if (weight.defined()) { @@ -644,7 +616,7 @@ static std::tuple native_layer_norm_backward_p if (num_front_dims_to_reduce == 0) { grad_bias = grad_out; } else { - grad_bias = grad_out.sum(range(0, num_front_dims_to_reduce)); + grad_bias = grad_out.sum(range(0, static_cast(num_front_dims_to_reduce))); } } if (output_mask[1] && weight_value.has_value()) { @@ -656,15 +628,13 @@ static std::tuple native_layer_norm_backward_p if (num_front_dims_to_reduce == 0) { grad_weight = expanded_grad_weight; } else { - grad_weight = expanded_grad_weight.sum(range(0, num_front_dims_to_reduce)); + grad_weight = expanded_grad_weight.sum(range(0, static_cast(num_front_dims_to_reduce))); } } if (output_mask[0]) { const auto grad_normalized_input = weight.defined() ? 
grad_out * weight : grad_out; - Tensor grad_normalized_input_value; - optional grad_normalized_input_bdim; - std::tie(grad_normalized_input_value, grad_normalized_input_bdim) = + auto [grad_normalized_input_value, grad_normalized_input_bdim] = unwrapTensorAtLevel(grad_normalized_input, cur_level); c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); @@ -906,4 +876,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("native_layer_norm_backward", native_layer_norm_backward_plumbing); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesPooling.cpp b/aten/src/ATen/functorch/BatchRulesPooling.cpp index b6ebb2e788089..68c25e6053d65 100644 --- a/aten/src/ATen/functorch/BatchRulesPooling.cpp +++ b/aten/src/ATen/functorch/BatchRulesPooling.cpp @@ -9,7 +9,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { template std::tuple,Tensor,optional> @@ -72,4 +72,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { ALL_TENSORS_HAVE_OPTIONAL_BDIM_BOXED_CONTIG1(4, max_pool3d_with_indices_backward, 2); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesRandomness.cpp b/aten/src/ATen/functorch/BatchRulesRandomness.cpp index 47cff54575cfd..79572f22ea3f6 100644 --- a/aten/src/ATen/functorch/BatchRulesRandomness.cpp +++ b/aten/src/ATen/functorch/BatchRulesRandomness.cpp @@ -16,8 +16,7 @@ // registered to FuncTorchVmapMode. This is because we need to interpose on // random operations even if they're not on a BatchedTensor. -namespace at { -namespace functorch { +namespace at::functorch { template Tensor random_batching_rule(SymIntArrayRef shape, ExtraArgs... extra_args) { @@ -40,9 +39,7 @@ Tensor& random_inplace_batching_rule(Tensor& self, ExtraArgs... extra_args) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); self_value = moveBatchDimToFront(self_value, self_bdim); RandomnessType randomness = maybe_layer->randomness(); check_randomness(randomness); @@ -67,13 +64,9 @@ static Tensor& bernoulli_inplace_Tensor_batching_rule(Tensor& self, const Tensor auto cur_level = maybe_layer->layerId(); RandomnessType randomness = maybe_layer->randomness(); - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); - Tensor other_value; - optional other_bdim; - std::tie(other_value, other_bdim) = unwrapTensorAtLevel(p_, cur_level); + auto [other_value, other_bdim] = unwrapTensorAtLevel(p_, cur_level); check_randomness(randomness, other_bdim.has_value()); @@ -135,9 +128,7 @@ Tensor unary_pointwise_random_batch_rule(const Tensor& tensor, ExtraArgs... extr auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); - Tensor tensor_value; - optional tensor_bdim; - std::tie(tensor_value, tensor_bdim) = unwrapTensorAtLevel(tensor, cur_level); + auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(tensor, cur_level); tensor_value = moveBatchDimToFront(tensor_value, tensor_bdim); RandomnessType randomness = maybe_layer->randomness(); @@ -165,9 +156,7 @@ Tensor tensor_like_random_batch_rule(const Tensor& self, ExtraArgs... 
extra_args RandomnessType randomness = maybe_layer->randomness(); check_randomness(randomness); - Tensor tensor_value; - optional tensor_bdim; - std::tie(tensor_value, tensor_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(self, cur_level); tensor_value = moveBatchDimToFront(tensor_value, tensor_bdim); if (randomness == RandomnessType::Same && tensor_bdim) { @@ -190,9 +179,7 @@ static std::tuple native_dropout_batching_rule(const Tensor& tens const auto cur_level = maybe_layer->layerId(); RandomnessType randomness = maybe_layer->randomness(); - Tensor tensor_value; - optional tensor_bdim; - std::tie(tensor_value, tensor_bdim) = unwrapTensorAtLevel(tensor, cur_level); + auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(tensor, cur_level); tensor_value = moveBatchDimToFront(tensor_value, tensor_bdim); if (!train.has_value() || train) { @@ -212,8 +199,8 @@ static std::tuple native_dropout_batching_rule(const Tensor& tens } auto [output, mask] = at::native_dropout(tensor_value, p, train); return std::make_tuple( - makeBatched(std::move(output), 0, cur_level), - makeBatched(std::move(mask), 0, cur_level)); + makeBatched(output, 0, cur_level), + makeBatched(mask, 0, cur_level)); } // repeated code from the CPU kernel since the CUDA one doesn't call bernoulli_ explicitly @@ -231,9 +218,7 @@ static Tensor multinomial_batching_rule(const Tensor& self, const int64_t num_sa auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); self_value = moveBatchDimToFront(self_value, self_bdim); RandomnessType randomness = maybe_layer->randomness(); @@ -279,7 +264,7 @@ struct RandomBatchRuleHelper> { template Tensor rand_int_wrapper(SymIntArrayRef shape, c10::SymInt high, T... extra_args) { - return Func(high, std::move(shape), std::forward(extra_args)...); + return Func(high, shape, std::forward(extra_args)...); } template @@ -505,4 +490,5 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) { #undef UNARY_POINTWISE_RANDOM_LEADING_FLOAT #undef TENSOR_LIKE_COMMON_ARG_TYPES } -}} // namespace at::functorch + +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp index 62afe2699f395..cb6d6ac519dd8 100644 --- a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp @@ -11,7 +11,7 @@ #include -namespace at { namespace functorch { +namespace at::functorch { static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { return dim == 0 || dim == -1; @@ -75,7 +75,7 @@ static Tensor any_decomp(const Tensor& self) { return at::any(self.flatten(), 0, false); } -enum ReductionCase { DimArray, Dim }; +enum class ReductionCase:uint8_t { DimArray, Dim }; // Macros and templates have a difficult time dealing with enums, // so we didn't turn this into an enum. 
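The reduce-ops hunk above turns the unscoped `enum ReductionCase` into a scoped `enum class ReductionCase : uint8_t` and value-initializes the local with `ReductionCase reduction_case{};`. A minimal standalone sketch of that pattern (the names mirror the patch but nothing else from functorch is assumed):

#include <cstdint>
#include <cassert>

// Scoped enumeration with a fixed 1-byte underlying type: enumerators no
// longer leak into the enclosing scope and do not convert to int implicitly.
enum class ReductionCase : std::uint8_t { DimArray, Dim };

int main() {
  ReductionCase reduction_case{};          // value-initialized to DimArray (0)
  assert(reduction_case == ReductionCase::DimArray);
  reduction_case = ReductionCase::Dim;     // must be qualified with the enum name
  static_assert(sizeof(ReductionCase) == 1, "fixed underlying type");
  return 0;
}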
@@ -115,7 +115,7 @@ void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack auto orig_arguments = torch::jit::last(*stack, num_arguments); if (std::none_of(orig_arguments.begin(), orig_arguments.end(), ivalueParticipatesInCurrentLevel)) { - c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); + c10::impl::ExcludeDispatchKeyGuard guard_2(DispatchKey::FuncTorchBatched); op.callBoxed(stack); return; } @@ -123,15 +123,13 @@ void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack auto arguments = torch::jit::pop(*stack, num_arguments); TORCH_INTERNAL_ASSERT(arguments[0].isTensor()); - Tensor self; - optional self_bdim; - std::tie(self, self_bdim) = unwrapTensorAtLevel(arguments[0].toTensor(), cur_level); + auto [self, self_bdim] = unwrapTensorAtLevel(arguments[0].toTensor(), cur_level); self = moveBatchDimToFront(self, self_bdim); auto logical_dim = rankWithoutBatchDim(self, self_bdim); std::vector dims; - ReductionCase reduction_case; + ReductionCase reduction_case{}; if (arguments[dim_arg_pos].isIntList()) { reduction_case = ReductionCase::DimArray; dims = arguments[dim_arg_pos].toIntList().vec(); @@ -509,4 +507,5 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT(_is_all_true, _is_all_true_batch_rule); VMAP_SUPPORT(_is_any_true, _is_any_true_batch_rule); } -}} + +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp index d253fa0047db6..0a1475497b03d 100644 --- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp @@ -11,11 +11,10 @@ #include #include #include -#include #include -namespace at { namespace functorch { +namespace at::functorch { namespace { static bool any_has_value(ArrayRef> bdims) { @@ -326,9 +325,7 @@ Tensor index_plumbing(const Tensor & self, const List> & indice if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(indices, cur_level)) { return at::index(self, indices); } - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); std::vector> indices_value; std::vector> indices_bdims; for (const auto&& indRef : indices) { @@ -458,9 +455,7 @@ namespace { const List> &indices, const Tensor &values, int64_t cur_level) { - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); std::vector> indices_value; std::vector> indices_bdims; for (const auto &&indRef : indices) @@ -468,16 +463,13 @@ namespace { optional ind = indRef; optional index; optional index_bdim; - if (ind.has_value()) - { + if (ind.has_value()) { std::tie(index, index_bdim) = unwrapTensorAtLevel(ind.value(), cur_level); } indices_value.push_back(index); indices_bdims.push_back(index_bdim); } - Tensor values_value; - optional values_bdim; - std::tie(values_value, values_bdim) = unwrapTensorAtLevel(values, cur_level); + auto [values_value, values_bdim] = unwrapTensorAtLevel(values, cur_level); return std::make_tuple(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim); } @@ -494,9 +486,7 @@ void index_put__batch_rule( if (!self_bdim.has_value()) { vmapIncompatibleInplaceError("index_put_"); } - Tensor self_, values_; - std::vector> indices_; - std::tie(self_, indices_, values_) = index_put_batch_rule_helper( 
+ auto [self_, indices_, values_] = index_put_batch_rule_helper( self, self_bdim, indices, indices_bdims, values, values_bdim); at::index_put_(self_, List>(indices_), values_, accumulate); } @@ -511,11 +501,7 @@ Tensor& index_put__plumbing(Tensor & self, const List> & indice if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(indices, cur_level) && !isBatchedAtLevel(values, cur_level)) { return self.index_put_(indices, values, accumulate); } - Tensor self_value, values_value; - optional self_bdim, values_bdim; - std::vector> indices_value; - std::vector> indices_bdims; - std::tie(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim) = + auto [self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim] = unpackSelfAndIndicesAndValuesAtCurrentLevel(self, indices, values, cur_level); index_put__batch_rule(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim, accumulate); return self; @@ -533,9 +519,7 @@ void _index_put_impl__batch_rule( if (!self_bdim.has_value()) { vmapIncompatibleInplaceError("_index_put_impl_"); } - Tensor self_, values_; - std::vector> indices_; - std::tie(self_, indices_, values_) = index_put_batch_rule_helper( + auto [self_, indices_, values_] = index_put_batch_rule_helper( self, self_bdim, indices, indices_bdims, values, values_bdim); at::_index_put_impl_(self_, List>(indices_), values_, accumulate, unsafe); } @@ -550,11 +534,7 @@ Tensor &_index_put_impl__plumbing(Tensor &self, const List> &in if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(indices, cur_level) && !isBatchedAtLevel(values, cur_level)) { return at::_index_put_impl_(self, indices, values, accumulate, unsafe); } - Tensor self_value, values_value; - optional self_bdim, values_bdim; - std::vector> indices_value; - std::vector> indices_bdims; - std::tie(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim) = + auto [self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim] = unpackSelfAndIndicesAndValuesAtCurrentLevel(self, indices, values, cur_level); _index_put_impl__batch_rule(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim, accumulate, unsafe); return self; @@ -639,9 +619,7 @@ std::tuple> index_put_batch_rule( } } - Tensor self_, values_; - std::vector> indices_; - std::tie(self_, indices_, values_) = index_put_batch_rule_helper( + auto [self_, indices_, values_] = index_put_batch_rule_helper( self, self_bdim, indices, indices_bdims, values, values_bdim, batch_size); // Why do we need to permute values? 
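Most of the churn in these functorch files is the same mechanical cleanup: declaring a value/optional-batch-dim pair up front and filling it with `std::tie` is replaced by a C++17 structured binding over the tuple returned by `unwrapTensorAtLevel`. A self-contained before/after sketch; `Value` and `unwrap` are stand-ins, not the real ATen types:

#include <iostream>
#include <optional>
#include <string>
#include <tuple>

struct Value { std::string name; };

// Stand-in for unwrapTensorAtLevel: returns the payload plus an optional batch dim.
std::tuple<Value, std::optional<int64_t>> unwrap(const Value& v) {
  return std::make_tuple(v, std::optional<int64_t>(0));
}

int main() {
  Value input{"input"};

  // Old style: default-construct, then std::tie.
  Value input_value;
  std::optional<int64_t> input_bdim;
  std::tie(input_value, input_bdim) = unwrap(input);

  // New style: one declaration, no default construction needed.
  auto [value, bdim] = unwrap(input);

  std::cout << value.name << " bdim=" << bdim.value_or(-1) << "\n";
  return 0;
}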
@@ -670,11 +648,7 @@ Tensor index_put_plumbing(const Tensor & self, const List> & in if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(indices, cur_level) && !isBatchedAtLevel(values, cur_level)) { return self.index_put(indices, values, accumulate); } - Tensor self_value, values_value; - optional self_bdim, values_bdim; - std::vector> indices_value; - std::vector> indices_bdims; - std::tie(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim) = + auto [self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim] = unpackSelfAndIndicesAndValuesAtCurrentLevel(self, indices, values, cur_level); auto results = index_put_batch_rule(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim, accumulate); return makeBatched(std::get<0>(results), std::get<1>(results), cur_level); @@ -835,7 +809,7 @@ Tensor get_expanded_index(const Tensor& index, IntArrayRef self_size, int64_t di if (index.dim() == 0) { return index.expand(self_size); } - dim = maybe_wrap_dim(dim, self_size.size()); + dim = maybe_wrap_dim(dim, static_cast(self_size.size())); // setup new_index_shape as [BS, 1, ..., idx_size, ..., 1] // to reshape index_ @@ -1270,4 +1244,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("as_strided_scatter", torch::CppFunction::makeFromBoxedFunction<&vmapErrorFallback>()); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp index fbf058addfbf6..f44000674db8a 100644 --- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp @@ -7,7 +7,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { namespace{ std::tuple> @@ -52,7 +52,7 @@ std::tuple> view_as_complex_batch_rule(const Tensor& self, optional self_bdim) { // guard against the user passing in a batch of scalar tensors with batch // size equal to 2. - TORCH_CHECK(self.sizes().size() > 1, "Input tensor must have one or more dimensions"); + TORCH_CHECK(self.sym_sizes().size() > 1, "Input tensor must have one or more dimensions"); auto self_ = moveBatchDimToFront(self, self_bdim); auto result = at::view_as_complex(self_); @@ -185,4 +185,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { } #undef INVOKE -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp index 345163cd5efdf..81e9d5b9aa21c 100644 --- a/aten/src/ATen/functorch/BatchRulesViews.cpp +++ b/aten/src/ATen/functorch/BatchRulesViews.cpp @@ -5,7 +5,6 @@ // LICENSE file in the root directory of this source tree. #include -#include #include #include @@ -17,7 +16,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { // Note [Adding vmap support for an operator] // Hey there! So you have an operator and you want to get it to work with vmap. @@ -163,9 +162,7 @@ const Tensor& resize__plumbing( return self.resize_(size, optional_memory_format); } - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); TORCH_INTERNAL_ASSERT(self_bdim.has_value()); // TODO: The following algorithm only works for batch dim == 0. 
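Several hunks above add an explicit cast when feeding a `size()` (a `size_t`) into `maybe_wrap_dim`, which does signed arithmetic on the rank. A tiny sketch of why the cast is there, assuming only the usual negative-index wrapping convention (the helper below is illustrative, not ATen's implementation):

#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Maps dim in [-rank, rank) to [0, rank), as dimension wrapping conventionally does.
int64_t wrap_dim(int64_t dim, int64_t rank) {
  if (dim < -rank || dim >= rank) throw std::out_of_range("dim out of range");
  return dim < 0 ? dim + rank : dim;
}

int main() {
  std::vector<int64_t> sizes{2, 3, 4};
  // sizes.size() is unsigned; casting once keeps the arithmetic in wrap_dim
  // signed and silences sign-conversion warnings at the call site.
  int64_t rank = static_cast<int64_t>(sizes.size());
  assert(wrap_dim(-1, rank) == 2);
  assert(wrap_dim(0, rank) == 0);
  return 0;
}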
@@ -204,7 +201,7 @@ std::tuple> squeeze_batch_rule(const Tensor& self, opt int64_t new_batch_idx = 0; int64_t original_idx = 0; - for (auto it : shape) { + for (const auto& it : shape) { // Keep only dimensions != 1 and the batch dimension (irrespective of size). if (it != 1 || original_idx == bdim) { squeezed_sizes.push_back(it); @@ -294,7 +291,7 @@ std::tuple> roll_batch_rule(const Tensor& self, option return std::make_tuple(at::roll_symint(self_, shifts, new_dims), 0); } // We will do something like: t.reshape(a, -1).roll(1, dims=[1, ]).reshape(old_shape) - auto old_shape = self_.sizes(); + auto old_shape = self_.sym_sizes(); new_dims.push_back(1); auto logical_rank = rankWithoutBatchDim(self, bdim); if (logical_rank == 0) { @@ -304,7 +301,7 @@ std::tuple> roll_batch_rule(const Tensor& self, option auto output = at::roll_symint(self_.flatten(1), shifts, new_dims); // NOTE: For scalar tensor, we don't need to unsqueeze as reshape // with `old_shape` takes care of it. - output = output.reshape(old_shape); + output = output.reshape_symint(old_shape); return std::make_tuple(output, 0); } @@ -454,7 +451,7 @@ std::tuple> expand_batch_rule( auto self_ = moveBatchDimToFront(self, self_bdim); auto self_sizes = self_.sym_sizes(); - auto batch_size = self_sizes[0]; + const auto& batch_size = self_sizes[0]; c10::SmallVector size_(size.size() + 1); size_[0] = batch_size; @@ -589,4 +586,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT2(unsafe_split, Tensor, unsafe_split_batch_rule); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchedFallback.cpp b/aten/src/ATen/functorch/BatchedFallback.cpp index 8014933056e7e..ef24406846c6c 100644 --- a/aten/src/ATen/functorch/BatchedFallback.cpp +++ b/aten/src/ATen/functorch/BatchedFallback.cpp @@ -17,8 +17,7 @@ #include #include -namespace at { -namespace functorch { +namespace at::functorch { bool kVmapFallbackWarningEnabled = true; @@ -160,7 +159,7 @@ static void batchedTensorInplaceForLoopFallback(const c10::OperatorHandle& op, t "please file a bug report instead."); } batched_tensor_inputs.push_back(tensor); - batched_tensor_inputs_position.push_back(idx); + batched_tensor_inputs_position.push_back(static_cast(idx)); } TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty()); @@ -305,7 +304,7 @@ void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Sta continue; } batched_tensor_inputs.push_back(tensor); - batched_tensor_inputs_position.push_back(idx); + batched_tensor_inputs_position.push_back(static_cast(idx)); } TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty()); @@ -446,18 +445,18 @@ void batchedNestedTensorForLoopFallback(const c10::OperatorHandle& op, torch::ji continue; } batched_tensor_inputs.push_back(tensor); - batched_tensor_inputs_position.push_back(idx); + batched_tensor_inputs_position.push_back(static_cast(idx)); } TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty()); std::vector> unbound; - for (auto iter = batched_tensor_inputs.begin(); iter != batched_tensor_inputs.end(); ++iter) { - auto *batched_impl = maybeGetBatchedImpl(*iter); + for (auto const &batched_tensor_input: batched_tensor_inputs) { + auto *batched_impl = maybeGetBatchedImpl(batched_tensor_input); TORCH_INTERNAL_ASSERT(batched_impl->value().is_nested() || batched_impl->bdim() == 0, "Fallback not supported for mixed nested / non-nested arguments without bdim=0"); c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::BatchedNestedTensor); auto this_unbound = batched_impl->value().unbind(); - if (unbound.size() > 
0) { + if (!unbound.empty()) { TORCH_INTERNAL_ASSERT(unbound.front().size() == this_unbound.size(), "Fallback not supported for differently-sized nested arguments"); } @@ -514,5 +513,4 @@ void vmapErrorFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) TORCH_CHECK(false, "Error: ", op.operator_name(), " requires special handling, and does not yet have a batching rule. Feel free to file a github issue!"); } -} -} // namespace at +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.cpp b/aten/src/ATen/functorch/BatchedTensorImpl.cpp index 1af054a2eba46..7eae8303d2af6 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.cpp +++ b/aten/src/ATen/functorch/BatchedTensorImpl.cpp @@ -10,8 +10,7 @@ #include -namespace at { -namespace functorch { +namespace at::functorch { BatchedTensorImpl::BatchedTensorImpl(DispatchKeySet key_set, Tensor value, int64_t bdim, int64_t level) : TensorImpl( @@ -71,7 +70,7 @@ void BatchedTensorImpl::refreshTensorMetadata() { int64_t BatchedTensorImpl::actualDim(int64_t dim, bool wrap_dim) const { if (wrap_dim) { const auto ndim = sizes_and_strides_.size(); - dim = maybe_wrap_dim(dim, ndim); + dim = maybe_wrap_dim(dim, static_cast(ndim)); } if (bdim_ <= dim) { return dim + 1; @@ -161,6 +160,7 @@ c10::intrusive_ptr BatchedTensorImpl::shallow_copy_and_detach( } c10::intrusive_ptr BatchedTensorImpl::shallow_copy_and_detach( + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) c10::VariableVersion&& version_counter, bool allow_tensor_metadata_change) const { TORCH_CHECK(false, "accessing `data` under vmap transform is not allowed"); @@ -185,5 +185,4 @@ Tensor addBatchDim(const Tensor& tensor, int64_t dim, int64_t level) { return makeBatched(tensor, dim, level); } -} -} // namespace at +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h index d29f3f6d6a370..f3754e3c30816 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.h +++ b/aten/src/ATen/functorch/BatchedTensorImpl.h @@ -7,7 +7,6 @@ #pragma once #include -#include #include #include @@ -119,15 +118,15 @@ inline bool isBatchedTensor(const Tensor& tensor) { // It is unsafe to call this on a Tensor that is not backed by a // BatchedTensorImpl. Please use `maybeGetBatchedImpl` whenever possible. -inline BatchedTensorImpl* unsafeGetBatchedImpl(Tensor tensor) { +inline BatchedTensorImpl* unsafeGetBatchedImpl(const Tensor& tensor) { return static_cast(tensor.unsafeGetTensorImpl()); } -inline BatchedTensorImpl* maybeGetBatchedImpl(Tensor tensor) { +inline BatchedTensorImpl* maybeGetBatchedImpl(const Tensor& tensor) { if (!isBatchedTensor(tensor)) { return nullptr; } - return unsafeGetBatchedImpl(std::move(tensor)); + return unsafeGetBatchedImpl(tensor); } // Returns a bitset. If bit i is set, then that means dim i is a batchdim. 
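The BatchedTensorImpl.h hunk above changes `maybeGetBatchedImpl` and `unsafeGetBatchedImpl` to take `const Tensor&` instead of `Tensor` by value, so callers no longer pay a refcount bump per query. A sketch of the effect with a generic shared handle standing in for `at::Tensor`:

#include <cassert>
#include <memory>

// Stand-in for a Tensor handle: copying it bumps a shared refcount, much like
// copying an at::Tensor bumps its TensorImpl refcount.
struct Handle {
  std::shared_ptr<int> impl = std::make_shared<int>(0);
};

long use_count_by_value(Handle h) { return h.impl.use_count(); }        // copy: refcount churn
long use_count_by_cref(const Handle& h) { return h.impl.use_count(); }  // no copy

int main() {
  Handle t;
  assert(use_count_by_value(t) == 2);  // the by-value parameter holds its own reference
  assert(use_count_by_cref(t) == 1);   // no extra reference taken
  return 0;
}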
diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp index 2d271a613340a..45976fa855f32 100644 --- a/aten/src/ATen/functorch/DynamicLayer.cpp +++ b/aten/src/ATen/functorch/DynamicLayer.cpp @@ -17,8 +17,7 @@ #include #include -namespace at { -namespace functorch { +namespace at::functorch { void setDynamicLayerFrontBackKeysIncluded(bool included) { c10::impl::tls_set_dispatch_key_included(DispatchKey::FuncTorchDynamicLayerFrontMode, included); @@ -235,7 +234,7 @@ int64_t pushDynamicLayer(DynamicLayer&& dynamic_layer) { auto& dynamicLayerStack = dynamicLayerStackAccessor(); int64_t layerId = 1 + dynamicLayerStack.size(); TORCH_INTERNAL_ASSERT(layerId == dynamic_layer.layerId()); - dynamicLayerStack.emplace_back(dynamic_layer); + dynamicLayerStack.emplace_back(std::move(dynamic_layer)); if (layerId == 1) { setDynamicLayerFrontBackKeysIncluded(true); @@ -258,7 +257,7 @@ int64_t initAndPushDynamicLayer( optional functionalize_add_back_views) { const auto& dynamicLayerStack = dynamicLayerStackAccessor(); const auto layerId = 1 + dynamicLayerStack.size(); - DynamicLayer new_layer(transform_type, layerId, batch_size, randomness, prev_grad_mode, prev_fwd_grad_mode, functionalize_add_back_views); + DynamicLayer new_layer(transform_type, layerId, std::move(batch_size), randomness, prev_grad_mode, prev_fwd_grad_mode, functionalize_add_back_views); // NB: this function should be called while holding the GIL to avoid races new_layer.interpreter().set_is_alive(true); pushDynamicLayer(std::move(new_layer)); @@ -307,7 +306,7 @@ void foreachTensorInplace(std::vector& args, int64_t begin, int64_t end, } void foreachTensorInplaceWithFlag(std::vector& args, int64_t begin, int64_t end, - const std::bitset<64> use_flag_relative, std::function func){ + const std::bitset<64> use_flag_relative, const std::function& func){ TORCH_INTERNAL_ASSERT(begin >= 0); TORCH_INTERNAL_ASSERT(end >= 0); TORCH_INTERNAL_ASSERT(begin <= end); @@ -511,5 +510,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchDynamicLayerBackMode, m) { SPECIAL_GRAD_CASE(alias); } -} -} // namespace at +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/FunctionalizeInterpreter.cpp b/aten/src/ATen/functorch/FunctionalizeInterpreter.cpp index 717eb87ae1f31..89175cc79c5ec 100644 --- a/aten/src/ATen/functorch/FunctionalizeInterpreter.cpp +++ b/aten/src/ATen/functorch/FunctionalizeInterpreter.cpp @@ -2,7 +2,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { static void sanityCheckNotFunctional(const c10::OperatorHandle& op, torch::jit::Stack* stack, size_t num_args) { foreachTensorInplace(*stack, stack->size() - num_args, stack->size(), @@ -64,4 +64,4 @@ void FunctionalizeInterpreterPtr::sendToNextInterpreterImpl( sanityCheckNotFunctional(op, stack, ret_size); } -}} // namespace at::functorch +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/Interpreter.cpp b/aten/src/ATen/functorch/Interpreter.cpp index 8d672c4128280..609cda8562953 100644 --- a/aten/src/ATen/functorch/Interpreter.cpp +++ b/aten/src/ATen/functorch/Interpreter.cpp @@ -6,9 +6,7 @@ #include #include -#include - -namespace at { namespace functorch { +namespace at::functorch { static DispatchKeySet get_all_dynlayer_keyset() { // NB: FULL_AFTER does not include the dispatch key @@ -92,12 +90,12 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t) { void sanityCheckStack(const c10::OperatorHandle& op, torch::jit::Stack* stack) { auto num_args = op.schema().arguments().size(); - 
foreachTensorInplace(*stack, stack->size() - num_args, stack->size(), + foreachTensorInplace(*stack, static_cast(stack->size() - num_args), static_cast(stack->size()), [](const Tensor& tensor) { auto result = unwrapIfDead(tensor); auto* wrapper = maybeGetTensorWrapper(result); TORCH_INTERNAL_ASSERT(wrapper == nullptr); - auto* batched = maybeGetBatchedImpl(std::move(result)); + auto* batched = maybeGetBatchedImpl(result); TORCH_INTERNAL_ASSERT(batched == nullptr); return tensor; }); @@ -129,4 +127,4 @@ void Interpreter::sendToNextInterpreter(const c10::OperatorHandle& op, torch::ji INTERPRETER_DISPATCH(key_, SINGLE_ARG(sendToNextInterpreterImpl(op, stack, grad_special_case))); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/Interpreter.h b/aten/src/ATen/functorch/Interpreter.h index 81190ffde1686..c08882fc30137 100644 --- a/aten/src/ATen/functorch/Interpreter.h +++ b/aten/src/ATen/functorch/Interpreter.h @@ -5,6 +5,7 @@ #include #include #include +#include #include namespace at::functorch { @@ -144,7 +145,7 @@ struct Interpreter { void saveLocalDispatchKeySet(c10::impl::LocalDispatchKeySet keyset) { TORCH_INTERNAL_ASSERT(!savedLocalDispatchKeySet_.has_value()); - savedLocalDispatchKeySet_ = std::move(keyset); + savedLocalDispatchKeySet_ = keyset; } void clearSavedLocalDispatchKeySet() { TORCH_INTERNAL_ASSERT(savedLocalDispatchKeySet_.has_value()); @@ -173,11 +174,11 @@ struct Interpreter { private: explicit Interpreter(TransformType type, int64_t level, InterpreterMeta meta): - type_(type), level_(level), is_alive_(std::make_shared(false)), meta_(meta) {} + type_(type), level_(level), is_alive_(std::make_shared(false)), meta_(std::move(meta)) {} // fields - TransformType type_; - int64_t level_; + TransformType type_{}; + int64_t level_{}; optional savedLocalDispatchKeySet_; std::shared_ptr is_alive_; InterpreterMeta meta_; @@ -195,7 +196,7 @@ void foreachTensorInplace(std::vector& args, int64_t begin, int64_t end, // args[i] = func(args[i], i - begin, true) // args[i] = func(args[i], i - begin) void foreachTensorInplaceWithFlag(std::vector& args, int64_t begin, int64_t end, - const std::bitset<64> use_flag_relative, std::function func); + const std::bitset<64> use_flag_relative, const std::function& func); std::vector findUnwrappedInputs(std::vector& args, int64_t begin, int64_t end); diff --git a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp index 5dd569bea1e36..b7a131766ec86 100644 --- a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp @@ -19,8 +19,7 @@ #include -namespace at { -namespace functorch { +namespace at::functorch { // NOTE: [What is a batching rule?] 
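A recurring one-line change across these files is collapsing the doubly nested `namespace at { namespace functorch {` opener (and its `}}` closer) into the C++17 nested-namespace form with a commented closing brace. A compilable sketch of the two spellings:

// Pre-C++17 spelling:
//   namespace at { namespace functorch {
//   ...
//   }}   // two bare closing braces, easy to mismatch
//
// C++17 nested namespace definition, as used throughout this patch:
namespace at::functorch {

inline int answer() { return 42; }

} // namespace at::functorch

int main() { return at::functorch::answer() == 42 ? 0 : 1; }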
@@ -259,6 +258,18 @@ std::vector split_with_sizes_batching_rule(const Tensor& self, SymIntArr return result; } +std::vector split_with_sizes_copy_batching_rule(const Tensor& self, SymIntArrayRef split_sizes, int64_t dim) { + if (!participatesInCurrentLevel(self)) { + c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); + return split_with_sizes_copy_symint(self, split_sizes, dim); + } + auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); + auto dim_physical = self_physical.getPhysicalDim(dim); + auto result = split_with_sizes_copy_symint(self_physical.tensor(), split_sizes, dim_physical); + self_physical.getPhysicalToLogicalMap().applyInplace(result); + return result; +} + std::vector unbind_batching_rule(const Tensor& self, int64_t dim) { if (!participatesInCurrentLevel(self)) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); @@ -275,7 +286,7 @@ std::vector unbind_batching_rule(const Tensor& self, int64_t dim) { // can be indexed (or nullopt if such a location doesn't exist, e.g., tensors // with zero-size dims). static optional maximum_indexable_location( - c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, c10::SymInt storage_offset) { + c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, const c10::SymInt& storage_offset) { auto result = native::storage_size_for(sizes, strides); if (result == 0) { return nullopt; @@ -292,7 +303,7 @@ static void checkBasicAsStridedValidForSlice( int64_t num_batch_dims, c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, - optional maybe_storage_offset) { + const optional& maybe_storage_offset) { auto slice_sizes = physical_tensor.sym_sizes().slice(num_batch_dims); auto slice_strides = physical_tensor.sym_strides().slice(num_batch_dims); auto base_offset = physical_tensor.sym_storage_offset(); @@ -682,17 +693,17 @@ Tensor new_empty_strided_batching_rule( } Tensor nested_cat_batching_rule(const ITensorListRef& tensors, int64_t dim) { - TORCH_CHECK(tensors.size() > 0, "cat() not supported on empty tensor list"); + TORCH_CHECK(!tensors.empty(), "cat() not supported on empty tensor list"); std::vector> unbound; - for (auto tensor_iter = tensors.begin(); tensor_iter != tensors.end(); ++tensor_iter) { - auto* maybe_batched_impl = maybeGetBatchedImpl(*tensor_iter); + for (const auto & tensor : tensors) { + auto* maybe_batched_impl = maybeGetBatchedImpl(tensor); TORCH_CHECK(maybe_batched_impl, "Tried to run batching rule for cat() on a non-batched tensor"); auto nt = maybe_batched_impl->value(); TORCH_CHECK(nt.is_nested(), "Tried to run batching rule for cat() on a non-nested tensor"); c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::BatchedNestedTensor); auto this_unbound = nt.unbind(); - if (unbound.size() > 0) { + if (!unbound.empty()) { TORCH_INTERNAL_ASSERT(unbound.front().size() == this_unbound.size(), "cat() not supported for differently-sized nested arguments"); } @@ -725,6 +736,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { // still legacy b/c teturns multiple tensors m.impl("split.Tensor", split_batching_rule); m.impl("split_with_sizes", split_with_sizes_batching_rule); + m.impl("split_with_sizes_copy", split_with_sizes_copy_batching_rule); m.impl("unbind.int", unbind_batching_rule); m.impl("cat", cat_batching_rule); m.impl("block_diag", block_diag_batching_rule); @@ -751,5 +763,5 @@ TORCH_LIBRARY_IMPL(_, BatchedNestedTensor, m) { TORCH_LIBRARY_IMPL(aten, BatchedNestedTensor, m) { m.impl("cat", nested_cat_batching_rule); } -} // namespace functorch -} // namespace 
at + +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/LegacyVmapTransforms.cpp b/aten/src/ATen/functorch/LegacyVmapTransforms.cpp index 682169a52622d..e7242cc8f07fc 100644 --- a/aten/src/ATen/functorch/LegacyVmapTransforms.cpp +++ b/aten/src/ATen/functorch/LegacyVmapTransforms.cpp @@ -10,8 +10,7 @@ #include #include -namespace at { -namespace functorch { +namespace at::functorch { // Takes a BatchedTensorImpl, permutes all of the batch dims to the front, // and then returns a physical version of the Tensor. @@ -20,7 +19,7 @@ static Tensor permuteBatchDimsToFront(const BatchedTensorImpl* batched) { if (batched->bdim() == 0) { return physical_tensor; } - const auto sizes = physical_tensor.sizes(); + const auto sizes = physical_tensor.sym_sizes(); VmapDimVector permutation(sizes.size(), 0); permutation.reserve(sizes.size()); const auto is_bdim = createBatchDimBitset(batched->bdim()); @@ -97,14 +96,14 @@ static std::tuple computeFrontBatchDimsFromLevels(std::bitset< return std::make_tuple(dim, level); } -static Tensor moveDimToFrontAndExpand(Tensor tensor, optional dim, int64_t size) { +static Tensor moveDimToFrontAndExpand(Tensor tensor, optional dim, c10::SymInt size) { if (dim) { tensor = tensor.movedim(*dim, 0); } else { tensor = tensor.unsqueeze(0); - auto expanded_sizes = tensor.sizes().vec(); + auto expanded_sizes = tensor.sym_sizes().vec(); expanded_sizes[0] = size; - tensor = tensor.expand(expanded_sizes); + tensor = tensor.expand_symint(expanded_sizes); } return tensor; } @@ -120,7 +119,7 @@ static Tensor moveDimToFrontAndExpand(Tensor tensor, optional dim, int6 VmapPhysicalViewVec MultiBatchVmapTransform::logicalToPhysical(ITensorListRef logical_tensors) { auto cur_level = maybeCurrentDynamicLayer().value().layerId(); - auto bdim_size = -1; + c10::SymInt bdim_size = -1; // Figure out the batch size first for (const auto& logical_tensor : logical_tensors) { @@ -131,12 +130,12 @@ MultiBatchVmapTransform::logicalToPhysical(ITensorListRef logical_tensors) { if (batched->level() != cur_level) { continue; } - bdim_size = batched->value().size(batched->bdim()); + bdim_size = batched->value().sym_size(batched->bdim()); } TORCH_INTERNAL_ASSERT(bdim_size != -1); std::bitset levels; - levels[cur_level] = 1; + levels[cur_level] = true; VmapPhysicalViewVec result; for (const auto& logical_tensor : logical_tensors) { @@ -185,7 +184,7 @@ VmapPhysicalViewVec BroadcastingVmapTransform::logicalToPhysical(TensorList logi TORCH_INTERNAL_ASSERT(bdim_size != -1); std::bitset levels; - levels[cur_level] = 1; + levels[cur_level] = true; // figure out the example ndim int64_t max_example_dim = -1; @@ -227,5 +226,4 @@ void VmapPhysicalToLogicalMap::applyInplace(std::vector& physical_tensor } } -} -} // namespace at +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/LegacyVmapTransforms.h b/aten/src/ATen/functorch/LegacyVmapTransforms.h index 7944d99bae45d..390989d45bf73 100644 --- a/aten/src/ATen/functorch/LegacyVmapTransforms.h +++ b/aten/src/ATen/functorch/LegacyVmapTransforms.h @@ -120,7 +120,7 @@ struct VmapPhysicalToLogicalMap; // levels: 012345 struct TORCH_API VmapPhysicalView { VmapPhysicalView(Tensor&& tensor, std::bitset levels) - : levels_(levels), tensor_(tensor) { + : levels_(levels), tensor_(std::move(tensor)) { // TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor)); } diff --git a/aten/src/ATen/functorch/PlumbingHelper.cpp b/aten/src/ATen/functorch/PlumbingHelper.cpp index 43ab457d4ad62..76982fd1b6480 100644 --- a/aten/src/ATen/functorch/PlumbingHelper.cpp +++ 
b/aten/src/ATen/functorch/PlumbingHelper.cpp @@ -9,7 +9,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { void vmap_check_escaped(const optional &layer, const char* what) { TORCH_CHECK( @@ -92,4 +92,4 @@ bool areAnyBatchedAtLevel(ArrayRef> maybe_tensors, int64_t leve } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp index 448145b14e480..355ac5965da51 100644 --- a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp +++ b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp @@ -11,7 +11,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { // NOTE: [functorch's PyTorch Operator Hacks] // @@ -167,7 +167,7 @@ namespace dropout_hack { namespace { template -using Ctype = typename std::conditional::type; +using Ctype = std::conditional_t; static Tensor make_feature_noise(const Tensor& input) { auto input_sizes = input.sizes(); @@ -312,4 +312,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchDynamicLayerFrontMode, m) { m.impl("feature_alpha_dropout_", dropout_hack::feature_alpha_dropout_); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/TensorWrapper.cpp b/aten/src/ATen/functorch/TensorWrapper.cpp index a8411570801db..4be5725e800f3 100644 --- a/aten/src/ATen/functorch/TensorWrapper.cpp +++ b/aten/src/ATen/functorch/TensorWrapper.cpp @@ -13,8 +13,7 @@ #include -namespace at { -namespace functorch { +namespace at::functorch { void dumpTensor(std::ostream& ss, const Tensor& tensor) { auto* wrapped = maybeGetTensorWrapper(tensor); @@ -51,7 +50,7 @@ void TensorWrapper::refreshMetadata() { void dumpTensorCout(const Tensor& tensor) { dumpTensor(std::cout, tensor); - std::cout << std::endl; + std::cout << '\n'; } static c10::intrusive_ptr makeTensorWrapperPtr(const Tensor& tensor, int64_t level, const std::shared_ptr& life_handle) { @@ -82,6 +81,11 @@ static Tensor unsafeMakeTensorWrapper( auto result = at::detail::make_tensor( key_set, tensor, level, life_handle, is_immutable); TORCH_INTERNAL_ASSERT(result.key_set().has(DispatchKey::FuncTorchGradWrapper)); + + if (tensor.unsafeGetTensorImpl()->is_wrapped_number()) { + result.unsafeGetTensorImpl()->set_wrapped_number(true); + } + return result; } @@ -204,5 +208,4 @@ TORCH_LIBRARY_IMPL(_, FuncTorchGradWrapper, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&dead_tensor_wrapper_fallback>()); } -} -} // namespace at +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/VmapInterpreter.cpp b/aten/src/ATen/functorch/VmapInterpreter.cpp index 21f40adb0d140..33774e0433264 100644 --- a/aten/src/ATen/functorch/VmapInterpreter.cpp +++ b/aten/src/ATen/functorch/VmapInterpreter.cpp @@ -1,7 +1,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { void VmapInterpreterPtr::processImpl( const c10::OperatorHandle& op, @@ -21,4 +21,4 @@ void VmapInterpreterPtr::sendToNextInterpreterImpl( op.callBoxed(stack); } -}} // namespace at::functorch +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/VmapModeRegistrations.cpp b/aten/src/ATen/functorch/VmapModeRegistrations.cpp index ad413d48c7ebc..195afd80bc713 100644 --- a/aten/src/ATen/functorch/VmapModeRegistrations.cpp +++ b/aten/src/ATen/functorch/VmapModeRegistrations.cpp @@ -17,8 +17,7 @@ // FuncTorchVmapMode -- these registrations are to error out on operations // that we don't support on regular Tensors. 
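The PyTorchOperatorHacks hunk above swaps the verbose `typename std::conditional<...>::type` spelling for the C++14 alias `std::conditional_t`. A standalone sketch of the equivalence; `int32_t` is a placeholder for whatever type the real `Ctype` alias selects:

#include <cstdint>
#include <type_traits>

// The C++11 form and the C++14 alias template name the same type;
// the alias also drops the leading `typename`.
template <bool inplace>
using CtypeOld = typename std::conditional<inplace, int32_t&, int32_t>::type;

template <bool inplace>
using Ctype = std::conditional_t<inplace, int32_t&, int32_t>;

static_assert(std::is_same_v<CtypeOld<true>,  Ctype<true>>,  "same reference type");
static_assert(std::is_same_v<CtypeOld<false>, Ctype<false>>, "same value type");

int main() { return 0; }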
-namespace at { -namespace functorch { +namespace at::functorch { static void unsupportedRandomOp(const c10::OperatorHandle& op, torch::jit::Stack* stack) { TORCH_CHECK(false, "vmap: We do not support calling out variants of random operations inside of vmap. ", @@ -68,6 +67,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) { NYI_RANDOM(rrelu); } - -} -} // namespace at +} // namespace at::functorch diff --git a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h index c5a607c51b391..8e2654bafe90b 100644 --- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -15,7 +15,7 @@ class HIPAllocatorMasqueradingAsCUDA final : public Allocator { public: explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator) : allocator_(allocator) {} - DataPtr allocate(size_t size) const override { + DataPtr allocate(size_t size) override { DataPtr r = allocator_->allocate(size); r.unsafe_set_device(Device(c10::DeviceType::CUDA, r.device().index())); return r; @@ -23,6 +23,9 @@ class HIPAllocatorMasqueradingAsCUDA final : public Allocator { DeleterFnPtr raw_deleter() const override { return allocator_->raw_deleter(); } + void copy_data(void* dest, const void* src, std::size_t count) const final { + allocator_->copy_data(dest, src, count); + } }; }} // namespace c10::hip diff --git a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h index 5ef7765519de9..a0fc211e4c8ae 100644 --- a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h @@ -88,6 +88,9 @@ struct HIPGuardImplMasqueradingAsCUDA final : public c10::impl::DeviceGuardImplI Stream getDefaultStream(Device d) const override { return getDefaultHIPStreamMasqueradingAsCUDA(d.index()); } + Stream getNewStream(Device d, int priority = 0) const override { + return getStreamFromPoolMasqueradingAsCUDA(priority, d.index()); + } Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false) const override { return getStreamFromPoolMasqueradingAsCUDA(isHighPriority, d.index()); } @@ -120,11 +123,9 @@ struct HIPGuardImplMasqueradingAsCUDA final : public c10::impl::DeviceGuardImplI auto hip_flag = hipEventDefault; switch (flag) { case EventFlag::PYTORCH_DEFAULT: - case EventFlag::HIP_EVENT_DISABLE_TIMING: hip_flag = hipEventDisableTiming; break; case EventFlag::BACKEND_DEFAULT: - case EventFlag::HIP_EVENT_DEFAULT: hip_flag = hipEventDefault; break; default: diff --git a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h index 2b30018b4a888..fb13ada5ad88e 100644 --- a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h @@ -96,6 +96,11 @@ inline getStreamFromPoolMasqueradingAsCUDA(const bool isHighPriority = false, De return HIPStreamMasqueradingAsCUDA(getStreamFromPool(isHighPriority, device)); } +HIPStreamMasqueradingAsCUDA +inline getStreamFromPoolMasqueradingAsCUDA(const int priority, DeviceIndex device = -1) { + return HIPStreamMasqueradingAsCUDA(getStreamFromPool(priority, device)); +} + HIPStreamMasqueradingAsCUDA inline getStreamFromExternalMasqueradingAsCUDA(hipStream_t ext_stream, DeviceIndex device) { return HIPStreamMasqueradingAsCUDA(getStreamFromExternal(ext_stream, device)); diff --git a/aten/src/ATen/miopen/AutocastRNN.cpp b/aten/src/ATen/miopen/AutocastRNN.cpp new file mode 
100644 index 0000000000000..271d80ea03cd4 --- /dev/null +++ b/aten/src/ATen/miopen/AutocastRNN.cpp @@ -0,0 +1,64 @@ +#include +#include +#include +#include + +namespace at { +namespace autocast { + +/********************************************************************** +Autocast wrapper for MIOpen RNNs +**********************************************************************/ +std::tuple +miopen_rnn(const Tensor & input_r, + TensorList weight, + int64_t weight_stride0, + const Tensor & hx, + const c10::optional& cx_opt, + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_num_layers, + bool batch_first, + double fn_dropout, + bool fn_train, + bool fn_bidirectional, + IntArrayRef fn_batch_sizes, + const c10::optional& fn_dropout_state_opt) { + +#if AT_ROCM_ENABLED() + + c10::impl::ExcludeDispatchKeyGuard no_autocast(DispatchKey::Autocast); + + return at::miopen_rnn( + cached_cast(at::kHalf, input_r), + cached_cast(at::kHalf, weight), + weight_stride0, + cached_cast(at::kHalf, hx), + cached_cast(at::kHalf, cx_opt), + fn_mode, + fn_hidden_size, + fn_num_layers, + batch_first, + fn_dropout, + fn_train, + fn_bidirectional, + fn_batch_sizes, + fn_dropout_state_opt); + +#else + AT_ERROR("autocast::miopen_rnn: ATen not compiled with ROCm enabled"); + return {Tensor{}, Tensor{}, Tensor{}, Tensor{}, Tensor{}}; // placate the compiler +#endif + +} + +// Register Autocast dispatch +namespace { +TORCH_LIBRARY_IMPL(aten, Autocast, m) { + m.impl("miopen_rnn", + TORCH_FN((&at::autocast::miopen_rnn))); +} +} // anonymous namespace + +} // namespace autocast +} // namespace at diff --git a/aten/src/ATen/mkl/Sparse.h b/aten/src/ATen/mkl/Sparse.h index 2763feef47c5c..9a09b042c9fe0 100644 --- a/aten/src/ATen/mkl/Sparse.h +++ b/aten/src/ATen/mkl/Sparse.h @@ -4,7 +4,7 @@ // MKL Sparse is not currently supported on Windows // See https://github.com/pytorch/pytorch/issues/97352 -#if AT_MKL_ENABLED() && (!defined(_WIN32)) +#if AT_MKL_ENABLED() #define AT_USE_MKL_SPARSE() 1 #else #define AT_USE_MKL_SPARSE() 0 diff --git a/aten/src/ATen/mkl/SparseBlas.cpp b/aten/src/ATen/mkl/SparseBlas.cpp index 90a60b42c1bdc..d84e6abb34f23 100644 --- a/aten/src/ATen/mkl/SparseBlas.cpp +++ b/aten/src/ATen/mkl/SparseBlas.cpp @@ -19,9 +19,7 @@ MKL_Complex to_mkl_complex(c10::complex scalar) { } // namespace -// There are link errors when compiling with create_csr functions on Windows. 
-// See https://github.com/pytorch/pytorch/pull/50937#issuecomment-779272492 -#if !defined(_WIN32) + template <> void create_csr(MKL_SPARSE_CREATE_CSR_ARGTYPES(float)) { TORCH_MKLSPARSE_CHECK(mkl_sparse_s_create_csr( @@ -117,7 +115,6 @@ void create_bsr>( col_indx, reinterpret_cast(values))); } -#endif // !defined(_WIN32) template <> void mv(MKL_SPARSE_MV_ARGTYPES(float)) { @@ -152,7 +149,6 @@ void mv>(MKL_SPARSE_MV_ARGTYPES(c10::complex)) { reinterpret_cast(y))); } -#if !defined(_WIN32) template <> void add(MKL_SPARSE_ADD_ARGTYPES(float)) { TORCH_MKLSPARSE_CHECK(mkl_sparse_s_add(operation, A, alpha, B, C)); @@ -171,7 +167,6 @@ void add>(MKL_SPARSE_ADD_ARGTYPES(c10::complex)) { TORCH_MKLSPARSE_CHECK(mkl_sparse_z_add( operation, A, to_mkl_complex(alpha), B, C)); } -#endif // !defined(_WIN32) template <> void export_csr(MKL_SPARSE_EXPORT_CSR_ARGTYPES(float)) { @@ -251,7 +246,6 @@ void mm>(MKL_SPARSE_MM_ARGTYPES(c10::complex)) { ldc)); } -#if !defined(_WIN32) template <> void spmmd(MKL_SPARSE_SPMMD_ARGTYPES(float)) { TORCH_MKLSPARSE_CHECK(mkl_sparse_s_spmmd( @@ -282,7 +276,6 @@ void spmmd>(MKL_SPARSE_SPMMD_ARGTYPES(c10::complex) reinterpret_cast(C), ldc)); } -#endif template <> void trsv(MKL_SPARSE_TRSV_ARGTYPES(float)) { diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index 3361cca8201c8..f7918ac18993c 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -43,7 +43,8 @@ TensorBase empty_mps( int64_t nelements = c10::multiply_integers(size); auto dtype = dtype_or_default(dtype_opt); TORCH_CHECK_TYPE(dtype != ScalarType::Double, MPS_ERROR_DOUBLE_NOT_SUPPORTED); - TORCH_CHECK_TYPE(dtype != ScalarType::BFloat16, "BFloat16 is not supported on MPS"); + TORCH_CHECK_TYPE(dtype != ScalarType::BFloat16 || is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_14_0_PLUS), "MPS BFloat16 is only supported on MacOS 14 or newer"); + auto dtype_meta = scalarTypeToTypeMeta(dtype); int64_t size_bytes = nelements * dtype_meta.itemsize(); diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm index 5b59cd5d1ddcd..76280fb469e58 100644 --- a/aten/src/ATen/mps/MPSAllocator.mm +++ b/aten/src/ATen/mps/MPSAllocator.mm @@ -748,7 +748,7 @@ DeleterFnPtr raw_deleter() const override { return &Delete; } - DataPtr allocate(const size_t nbytes) const override { + DataPtr allocate(const size_t nbytes) override { __block id buf = nbytes > 0 ? 
_getAllocImpl().malloc(nbytes, m_usage) : nullptr; return {buf, buf, &Delete, at::Device(at::DeviceType::MPS, 0)}; } @@ -819,6 +819,10 @@ bool waitForEvents(c10::ArrayRef buffers) const override { return _getAllocImpl().format_size(size); } + void copy_data(void* dest, const void* src, std::size_t count) const final { + default_copy_data(dest, src, count); + } + private: bool m_has_unified_memory; uint32_t m_usage; diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h index 40ab07077293d..084820ab42e41 100644 --- a/aten/src/ATen/mps/MPSDevice.h +++ b/aten/src/ATen/mps/MPSDevice.h @@ -22,8 +22,6 @@ typedef void* MTLComputePipelineState_t; typedef void* MTLLibrary_t; #endif -using namespace std; - namespace at::mps { // Helper enum to check if a MPSGraph op is supported in a given macOS version diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm index a0e3b70e98769..c6e8fd732e70a 100644 --- a/aten/src/ATen/mps/MPSDevice.mm +++ b/aten/src/ATen/mps/MPSDevice.mm @@ -14,8 +14,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& device, bool macOS13Plus) { // MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants) - // host_name attribute needs at least Metal 2.2 - MTLLanguageVersion languageVersion = MTLLanguageVersion2_2; + // host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+ + MTLLanguageVersion languageVersion = MTLLanguageVersion2_3; #if defined(__MAC_13_0) if (macOS13Plus) { languageVersion = MTLLanguageVersion3_0; diff --git a/aten/src/ATen/mps/MPSHooks.h b/aten/src/ATen/mps/MPSHooks.h index 546b47bf55aa6..667430eaf8114 100644 --- a/aten/src/ATen/mps/MPSHooks.h +++ b/aten/src/ATen/mps/MPSHooks.h @@ -46,6 +46,12 @@ struct MPSHooks : public at::MPSHooksInterface { void synchronizeEvent(uint32_t event_id) const override; bool queryEvent(uint32_t event_id) const override; double elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) const override; + + // Compatibility with Accelerator API + bool hasPrimaryContext(DeviceIndex device_index) const override { + // When MPS is available, it is always in use for the one device. + return true; + } }; } // namespace at::mps diff --git a/aten/src/ATen/mps/MPSProfiler.h b/aten/src/ATen/mps/MPSProfiler.h index 994c50ad9e61c..7ee9db5dd3242 100644 --- a/aten/src/ATen/mps/MPSProfiler.h +++ b/aten/src/ATen/mps/MPSProfiler.h @@ -9,12 +9,12 @@ #include #include +#include +#include #include #include -#include #include #include -#include namespace at::mps { @@ -296,9 +296,15 @@ class MPSProfiler { // during runtime (instead of environment variables). // The "mode" could be either "interval", "event", or both "interval,event" // for interval-based and/or event-based signpost tracing. 
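The allocator hunks above (HIPAllocatorMasqueradingAsCUDA and the MPS allocator) each gain a `copy_data` override: the wrapper delegates to the allocator it decorates, while the concrete allocator falls back to a default byte copy. A simplified sketch of that wrapper pattern, with invented interface names rather than the real c10 classes:

#include <cassert>
#include <cstddef>
#include <cstring>

// Simplified allocator interface: copy_data is the virtual hook being added.
struct Allocator {
  virtual ~Allocator() = default;
  virtual void copy_data(void* dest, const void* src, std::size_t count) const = 0;
};

struct HostAllocator final : Allocator {
  void copy_data(void* dest, const void* src, std::size_t count) const override {
    std::memcpy(dest, src, count);  // plays the role of default_copy_data
  }
};

struct MasqueradingAllocator final : Allocator {
  explicit MasqueradingAllocator(Allocator* inner) : inner_(inner) {}
  void copy_data(void* dest, const void* src, std::size_t count) const override {
    inner_->copy_data(dest, src, count);  // delegate, as the HIP wrapper does
  }
 private:
  Allocator* inner_;
};

int main() {
  HostAllocator host;
  MasqueradingAllocator wrapper(&host);
  char src[4] = {1, 2, 3, 4}, dst[4] = {};
  wrapper.copy_data(dst, src, sizeof(src));
  assert(dst[3] == 4);
  return 0;
}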
- void StartTrace(const string& mode, bool waitUntilCompleted); + void StartTrace(const std::string& mode, bool waitUntilCompleted); void StopTrace(); + // Abstractions for GPU trace capturing + bool isCaptureEnabled() const; + bool isCapturing() const; + void startCapture(const std::string& name, MPSStream* stream = nullptr); + void stopCapture(MPSStream* stream = nullptr); + // convenience functions to indicate whether signpost tracing or // logging are enabled for the SignpostTypes bool isOperationProfilingEnabled() const { @@ -356,6 +362,9 @@ class MPSProfiler { // a short list that contains copy stats std::unordered_map> m_copy_stat_list{}; + mutable MTLCaptureManager *captureManager = nil; + unsigned captureCount = 0; + void initialize(); void beginProfileExecution(BaseInfo& info, bool cpuExecution = false); void endProfileExecution(BaseInfo& info, os_signpost_id_t event_signpost_id, diff --git a/aten/src/ATen/mps/MPSProfiler.mm b/aten/src/ATen/mps/MPSProfiler.mm index e6e1a7257923b..522328277787b 100644 --- a/aten/src/ATen/mps/MPSProfiler.mm +++ b/aten/src/ATen/mps/MPSProfiler.mm @@ -195,7 +195,7 @@ } } -void MPSProfiler::StartTrace(const string& mode, bool waitUntilCompleted) { +void MPSProfiler::StartTrace(const std::string& mode, bool waitUntilCompleted) { TORCH_CHECK(m_profile_options == ProfileOptions::OPTIONS_NONE, "Tracing Signposts is already enabled "); std::stringstream ss(mode); @@ -765,6 +765,41 @@ struct sigaction MPSProfiler::currentSigint {}; struct sigaction MPSProfiler::previousSigint {}; +bool MPSProfiler::isCapturing() const { + return [captureManager isCapturing]; +} + +bool MPSProfiler::isCaptureEnabled() const { + if (captureManager == nil) { + captureManager = [MTLCaptureManager sharedCaptureManager]; + } + static bool isEnabled = [this]() { + return [captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]; + }(); + return isEnabled; +} + +void MPSProfiler::startCapture(const std::string& name, MPSStream* stream) { + if (captureManager == nil) { + captureManager = [MTLCaptureManager sharedCaptureManager]; + } + NSError* err = nil; + NSString* fname = [NSString stringWithFormat:@"%04d-%s.gputrace", captureCount++, name.c_str()]; + MTLCaptureDescriptor* captureDescriptor = [MTLCaptureDescriptor new]; + captureDescriptor.captureObject = stream ? 
(id)stream->commandQueue() : (id)MPSDevice::getInstance()->device(); + captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument; + captureDescriptor.outputURL = [NSURL fileURLWithPath:fname]; + auto rc = [captureManager startCaptureWithDescriptor:captureDescriptor error:&err]; + TORCH_CHECK(rc, "Failed to start capture of ", [fname UTF8String], " error ", [[err description] UTF8String]); +} + +void MPSProfiler::stopCapture(MPSStream* stream) { + if (stream) { + stream->synchronize(SyncType::COMMIT); + } + [captureManager stopCapture]; +} + } // namespace Profiler Profiler::MPSProfiler& getMPSProfiler() { diff --git a/aten/src/ATen/mps/MPSStream.mm b/aten/src/ATen/mps/MPSStream.mm index 2ac8b0cc64a36..0542a9fbd4c24 100644 --- a/aten/src/ATen/mps/MPSStream.mm +++ b/aten/src/ATen/mps/MPSStream.mm @@ -22,7 +22,7 @@ @interface MPSGraphExecutionDescriptor () _compilationDescriptor = [MPSGraphCompilationDescriptor new]; // disable commitAndContinue if Signpost tracing is enabled - if (getMPSProfiler().isSignpostTracingEnabled()) { + if (getMPSProfiler().isSignpostTracingEnabled() || getMPSProfiler().isCaptureEnabled()) { _enableCommitAndContinue = false; } _executionDescriptor.enableCommitAndContinue = _enableCommitAndContinue; @@ -173,11 +173,22 @@ @interface MPSGraphExecutionDescriptor () endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; - [blitEncoder copyFromBuffer:srcBuffer - sourceOffset:(NSUInteger)srcOffset - toBuffer:dstBuffer - destinationOffset:(NSUInteger)dstOffset - size:(NSUInteger)length]; + // For some reason copyFromBuffer for 4Gb fails without returning an error + // See https://github.com/pytorch/pytorch/issues/124335 + // Workaround by batching copy commands into 2Gb chunks + constexpr size_t max_copy_size = 0x80000000; // 2GB + size_t bytes_copied = 0; + size_t bytes_remains = length; + while (bytes_remains > 0) { + NSUInteger bytes_to_copy = std::min(max_copy_size, bytes_remains); + [blitEncoder copyFromBuffer:srcBuffer + sourceOffset:(NSUInteger)srcOffset + bytes_copied + toBuffer:dstBuffer + destinationOffset:(NSUInteger)dstOffset + bytes_copied + size:bytes_to_copy]; + bytes_copied += bytes_to_copy; + bytes_remains -= bytes_to_copy; + } [blitEncoder endEncoding]; // profilerId has a value only if copy profiling is enabled diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 7f5c696d1f6e7..533bc32216365 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -76,7 +76,6 @@ #include #include #include -#include #include #endif @@ -89,8 +88,8 @@ TORCH_META_FUNC(threshold)(const Tensor& self, const Scalar& threshold, const Sc build(TensorIteratorConfig() .set_check_mem_overlap(false) // threshold is idempotent, so overlap is okay .add_output(result) - .add_input(self) - .add_input(self) // other + .add_const_input(self) + .add_const_input(self) // other .allow_cpu_scalars(true) .promote_inputs_to_common_dtype(true) .cast_common_dtype_to_outputs(true) @@ -103,8 +102,8 @@ TORCH_META_FUNC(threshold_backward)(const Tensor& grad, const Tensor& self, cons build(TensorIteratorConfig() .set_check_mem_overlap(false) // threshold is idempotent, so overlap is okay .add_output(gradInput) - .add_input(self) - .add_input(grad) // other + .add_const_input(self) + .add_const_input(grad) // other .allow_cpu_scalars(true) .promote_inputs_to_common_dtype(true) .cast_common_dtype_to_outputs(true) @@ -393,7 +392,7 @@ TORCH_IMPL_FUNC(gelu_out_cpu) ( auto approximate_type = 
get_gelutype_enum(approximate); #if AT_MKLDNN_ENABLED() if (use_mkldnn(self) && (approximate_type == GeluType::None)) { - const ideep::tensor& x = itensor_from_tensor(self); + const ideep::tensor& x = itensor_from_tensor(self, /*from_const_data_ptr*/true); ideep::tensor y = itensor_from_tensor(result); ideep::eltwise_forward::compute( x, y, ideep::algorithm::eltwise_gelu_erf, ideep::prop_kind::forward_training, /*alpha*/ 0.0); @@ -411,8 +410,8 @@ TORCH_IMPL_FUNC(gelu_backward_out_cpu) ( auto approximate_type = get_gelutype_enum(approximate); #if AT_MKLDNN_ENABLED() if (use_mkldnn(self) && (approximate_type == GeluType::None)) { - const ideep::tensor& x = itensor_from_tensor(self); - ideep::tensor grady = itensor_from_tensor(grad); + const ideep::tensor& x = itensor_from_tensor(self, /*from_const_data_ptr*/true); + ideep::tensor grady = itensor_from_tensor(grad, /*from_const_data_ptr*/true); ideep::tensor gradx = itensor_from_tensor(grad_input); ideep::eltwise_backward::compute(x, grady, gradx, ideep::algorithm::eltwise_gelu_erf, /*alpha*/ 0.0); @@ -579,7 +578,7 @@ inline void _rrelu_with_noise_train( opmath_t upper = upper_.to(); Tensor tmp_tensor = output.contiguous(); scalar_t* output_data = tmp_tensor.data_ptr(); - scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* noise_data = noise.data_ptr(); auto gen = at::get_generator_or_default(generator, detail::getDefaultCPUGenerator()); std::lock_guard lock(gen->mutex_); @@ -717,8 +716,8 @@ Tensor _prelu_kernel(const Tensor& self, const Tensor& weight) { auto result = at::empty_like(self); auto iter = TensorIteratorConfig() .add_output(result) - .add_input(self) - .add_input(weight) + .add_const_input(self) + .add_const_input(weight) .build(); prelu_stub(iter.device_type(), iter); return result; @@ -730,9 +729,9 @@ std::tuple _prelu_kernel_backward(const Tensor& grad_out, const auto iter = TensorIteratorConfig() .add_output(grad_self) .add_output(grad_weight) - .add_input(self) - .add_input(weight) - .add_input(grad_out) + .add_const_input(self) + .add_const_input(weight) + .add_const_input(grad_out) .build(); prelu_backward_stub(iter.device_type(), iter); return {grad_self, grad_weight}; @@ -748,9 +747,8 @@ Tensor infinitely_differentiable_gelu_backward( } std::tuple log_sigmoid_forward_cpu(const Tensor& input) { - // FIXME: do these actually need to be zeros_like or can they be empty_like? 
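The recurring change in Activation.cpp is mechanical: operands that are only read are now registered with add_const_input instead of add_input, and buffers that the kernel fully overwrites are allocated with empty_like rather than zeros_like. A minimal sketch of the builder pattern, assuming only the public TensorIteratorConfig API; the op below is a made-up elementwise example and the per-device dispatch is left as a comment:

#include <ATen/ATen.h>
#include <ATen/TensorIterator.h>

// Hypothetical elementwise backward: grad_input = f(self, grad).
at::Tensor elementwise_backward_sketch(const at::Tensor& grad, const at::Tensor& self) {
  // empty_like suffices here: every element is written by the kernel.
  at::Tensor grad_input = at::empty_like(grad);
  auto iter = at::TensorIteratorConfig()
                  .add_output(grad_input)
                  .add_const_input(self)  // read-only operands advertise const-ness
                  .add_const_input(grad)
                  .build();
  // a per-device stub such as some_backward_stub(iter.device_type(), iter) would run here
  return grad_input;
}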
- auto result = at::zeros_like(input, at::MemoryFormat::Contiguous); - auto buffer = at::zeros_like(input, at::MemoryFormat::Contiguous); + auto result = at::empty_like(input, at::MemoryFormat::Contiguous); + auto buffer = at::empty_like(input, at::MemoryFormat::Contiguous); log_sigmoid_cpu_stub(kCPU, result, buffer, input.contiguous()); return std::make_tuple(result, buffer); } @@ -781,8 +779,8 @@ Tensor log_sigmoid_backward_cuda(const Tensor& grad_output, const Tensor& input, // NOTE: buffer is only used by CPU dispatch, we just ignore it here auto iter = at::TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(grad_output) .build(); log_sigmoid_backward_stub(kCUDA, iter); return iter.output(); @@ -792,9 +790,9 @@ Tensor log_sigmoid_backward_cpu(const Tensor& grad_output, const Tensor& input, auto grad_input = at::empty_like(grad_output); auto iter = at::TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(buffer) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(buffer) + .add_const_input(grad_output) .build(); log_sigmoid_backward_stub(kCPU, iter); return iter.output(); @@ -804,8 +802,8 @@ Tensor& log_sigmoid_backward_cuda_out(const Tensor& grad_output, const Tensor& i const Tensor& buffer, Tensor& grad_input) { auto iter = TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(grad_output) .build(); log_sigmoid_backward_stub(kCUDA, iter); return grad_input; @@ -817,9 +815,9 @@ Tensor& log_sigmoid_backward_cpu_out(const Tensor& grad_output, Tensor& grad_input) { auto iter = TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(buffer) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(buffer) + .add_const_input(grad_output) .build(); log_sigmoid_backward_stub(kCPU, iter); return grad_input; diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp index 38a6e2322ab75..bbd4f68d40d09 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp @@ -25,7 +25,7 @@ namespace { template static void adaptive_avg_pool3d_out_frame( - scalar_t* input_p, + const scalar_t* input_p, scalar_t* output_p, int64_t sizeD, int64_t isizeT, @@ -57,7 +57,7 @@ static void adaptive_avg_pool3d_out_frame( int kW = iendW - istartW; /* local pointers */ - scalar_t* ip = input_p + d * istrideD + istartT * istrideT + + const scalar_t* ip = input_p + d * istrideD + istartT * istrideT + istartH * istrideH + istartW * istrideW; scalar_t* op = output_p + d * osizeT * osizeH * osizeW + ot * osizeH * osizeW + oh * osizeW + ow; @@ -128,7 +128,7 @@ void adaptive_avg_pool3d_out_cpu_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "adaptive_avg_pool3d_cpu", [&] { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); adaptive_avg_pool3d_out_frame( input_data, @@ -151,7 +151,7 @@ void adaptive_avg_pool3d_out_cpu_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "adaptive_avg_pool3d_cpu", [&] { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) { for (const auto b : c10::irange(start, end)) { @@ -178,7 
+178,7 @@ void adaptive_avg_pool3d_out_cpu_template( template static void adaptive_avg_pool3d_backward_out_frame( scalar_t* gradInput_p, - scalar_t* gradOutput_p, + const scalar_t* gradOutput_p, int64_t sizeD, int64_t isizeT, int64_t isizeH, @@ -189,7 +189,7 @@ static void adaptive_avg_pool3d_backward_out_frame( at::parallel_for(0, sizeD, 1, [&](int64_t start, int64_t end) { for (const auto d : c10::irange(start, end)) { scalar_t* gradInput_p_d = gradInput_p + d * isizeT * isizeW * isizeH; - scalar_t* gradOutput_p_d = gradOutput_p + d * osizeT * osizeW * osizeH; + const scalar_t* gradOutput_p_d = gradOutput_p + d * osizeT * osizeW * osizeH; /* calculate average */ for (const auto ot : c10::irange(osizeT)) { @@ -251,7 +251,7 @@ Tensor& adaptive_avg_pool3d_backward_out_cpu_template( input.scalar_type(), "adaptive_avg_pool3d_backward_cpu", [&] { /* get raw pointers */ scalar_t* gradInput_data = gradInput.data_ptr(); - scalar_t* gradOutput_data = gradOutput.data_ptr(); + const scalar_t* gradOutput_data = gradOutput.const_data_ptr(); adaptive_avg_pool3d_backward_out_frame( gradInput_data, @@ -271,7 +271,7 @@ Tensor& adaptive_avg_pool3d_backward_out_cpu_template( input.scalar_type(), "adaptive_avg_pool3d_backward_cpu", [&] { /* get raw pointers */ scalar_t* gradInput_data = gradInput.data_ptr(); - scalar_t* gradOutput_data = gradOutput.data_ptr(); + const scalar_t* gradOutput_data = gradOutput.const_data_ptr(); at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) { for (const auto b : c10::irange(start, end)) { adaptive_avg_pool3d_backward_out_frame( @@ -317,6 +317,12 @@ Tensor adaptive_avg_pool3d_symint(Tensor const& input, SymIntArrayRef output_siz // in this case, adaptive pooling is just computing mean over hw // dimensions, which can be done more efficiently Tensor out = input.mean({-1, -2, -3}, /* keepdim = */ true); + if (input.suggest_memory_format() == at::MemoryFormat::ChannelsLast3d) { + // assert ndim == 5, since ndim = 4 doesn't give channels_last + const auto n = input.sym_size(0); + const auto c = input.sym_size(1); + out.as_strided__symint({n, c, 1, 1, 1}, {c, 1, c, c, c}); + } return out; } else { return _adaptive_avg_pool3d_symint(input, output_size); diff --git a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp index 78c355d2467a8..001e3c7d2d56e 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp @@ -82,7 +82,7 @@ namespace { template static void adaptive_max_pool3d_single_out_frame( - scalar_t *input_p, + const scalar_t *input_p, scalar_t *output_p, int64_t *ind_p, int64_t sizeD, @@ -121,7 +121,7 @@ static void adaptive_max_pool3d_single_out_frame( int64_t kW = iendW - istartW; /* local pointers */ - scalar_t *ip = input_p + d*istrideD + istartT *istrideT + istartH*istrideH + istartW*istrideW; + const scalar_t *ip = input_p + d*istrideD + istartT *istrideT + istartH*istrideH + istartW*istrideW; scalar_t *op = output_p + d*osizeT*osizeH*osizeW + ot*osizeH*osizeW + oh*osizeW + ow; int64_t *indp = ind_p + d*osizeT*osizeH*osizeW + ot*osizeH*osizeW + oh*osizeW + ow; @@ -159,7 +159,7 @@ static void adaptive_max_pool3d_single_out_frame( template static void adaptive_max_pool3d_out_frame( - scalar_t *input_data, + const scalar_t *input_data, scalar_t *output_data, int64_t *indices_data, int64_t sizeB, @@ -192,8 +192,8 @@ static void adaptive_max_pool3d_out_frame( template static void adaptive_max_pool3d_backward_single_out_frame( scalar_t *gradInput_p, - scalar_t 
*gradOutput_p, - int64_t *ind_p, + const scalar_t *gradOutput_p, + const int64_t *ind_p, int64_t sizeD, int64_t isizeT, int64_t isizeH, @@ -205,8 +205,8 @@ static void adaptive_max_pool3d_backward_single_out_frame( at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) { for (const auto d : c10::irange(start, end)) { scalar_t *gradInput_p_d = gradInput_p + d*isizeT*isizeH*isizeW; - scalar_t *gradOutput_p_d = gradOutput_p + d*osizeT*osizeH*osizeW; - int64_t *ind_p_d = ind_p + d*osizeT*osizeH*osizeW; + const scalar_t *gradOutput_p_d = gradOutput_p + d*osizeT*osizeH*osizeW; + const int64_t *ind_p_d = ind_p + d*osizeT*osizeH*osizeW; /* calculate max points */ int64_t ot, oh, ow; @@ -231,8 +231,8 @@ static void adaptive_max_pool3d_backward_single_out_frame( template static void adaptive_max_pool3d_backward_out_frame( scalar_t *gradInput_data, - scalar_t *gradOutput_data, - int64_t *indices_data, + const scalar_t *gradOutput_data, + const int64_t *indices_data, int64_t sizeB, int64_t sizeD, int64_t isizeT, @@ -299,7 +299,7 @@ TORCH_IMPL_FUNC(adaptive_max_pool3d_out_cpu) if (input.ndimension() == 4) { AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, input.scalar_type(), "adaptive_max_pool3d_cpu", [&] { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -322,7 +322,7 @@ TORCH_IMPL_FUNC(adaptive_max_pool3d_out_cpu) } else { AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, input.scalar_type(), "adaptive_max_pool3d_cpu", [&] { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -394,8 +394,8 @@ TORCH_IMPL_FUNC(adaptive_max_pool3d_backward_out_cpu) input.scalar_type(), "adaptive_max_pool3d_backward", [&] { /* get raw pointers */ scalar_t* gradInput_data = gradInput.data_ptr(); - scalar_t* gradOutput_data = gradOutput_.data_ptr(); - int64_t* indices_data = indices.data_ptr(); + const scalar_t* gradOutput_data = gradOutput_.const_data_ptr(); + const int64_t* indices_data = indices.const_data_ptr(); adaptive_max_pool3d_backward_single_out_frame( gradInput_data, @@ -414,8 +414,8 @@ TORCH_IMPL_FUNC(adaptive_max_pool3d_backward_out_cpu) input.scalar_type(), "adaptive_max_pool3d_backward", [&] { /* get raw pointers */ scalar_t* gradInput_data = gradInput.data_ptr(); - scalar_t* gradOutput_data = gradOutput_.data_ptr(); - int64_t* indices_data = indices.data_ptr(); + const scalar_t* gradOutput_data = gradOutput_.const_data_ptr(); + const int64_t* indices_data = indices.const_data_ptr(); adaptive_max_pool3d_backward_out_frame( gradInput_data, diff --git a/aten/src/ATen/native/AdaptivePooling.h b/aten/src/ATen/native/AdaptivePooling.h index d342d218e449a..bb2fda9906abe 100644 --- a/aten/src/ATen/native/AdaptivePooling.h +++ b/aten/src/ATen/native/AdaptivePooling.h @@ -8,15 +8,25 @@ namespace at::native { -using adaptive_avg_pooling_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size); -using adaptive_avg_pooling_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output); -DECLARE_DISPATCH(adaptive_avg_pooling_fn, adaptive_avg_pool2d_kernel); -DECLARE_DISPATCH(adaptive_avg_pooling_backward_fn, adaptive_avg_pool2d_backward_kernel); - -using adaptive_max_pooling_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, IntArrayRef output_size); -using adaptive_max_pooling_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, 
const Tensor& indices); -DECLARE_DISPATCH(adaptive_max_pooling_fn, adaptive_max_pool2d_kernel); -DECLARE_DISPATCH(adaptive_max_pooling_backward_fn, adaptive_max_pool2d_backward_kernel); +using adaptive_avg_pooling2d_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size); +using adaptive_avg_pooling2d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output); +DECLARE_DISPATCH(adaptive_avg_pooling2d_fn, adaptive_avg_pool2d_kernel); +DECLARE_DISPATCH(adaptive_avg_pooling2d_backward_fn, adaptive_avg_pool2d_backward_kernel); + +using adaptive_max_pooling2d_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, IntArrayRef output_size); +using adaptive_max_pooling2d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices); +DECLARE_DISPATCH(adaptive_max_pooling2d_fn, adaptive_max_pool2d_kernel); +DECLARE_DISPATCH(adaptive_max_pooling2d_backward_fn, adaptive_max_pool2d_backward_kernel); + +using adaptive_avg_pooling3d_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size); +using adaptive_avg_pooling3d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output); +DECLARE_DISPATCH(adaptive_avg_pooling3d_fn, adaptive_avg_pool3d_kernel); +DECLARE_DISPATCH(adaptive_avg_pooling3d_backward_fn, adaptive_avg_pool3d_backward_kernel); + +using adaptive_max_pooling3d_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, IntArrayRef output_size); +using adaptive_max_pooling3d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices); +DECLARE_DISPATCH(adaptive_max_pooling3d_fn, adaptive_max_pool3d_kernel); +DECLARE_DISPATCH(adaptive_max_pooling3d_backward_fn, adaptive_max_pool3d_backward_kernel); static inline int64_t start_index(int64_t a, int64_t b, int64_t c) { return (a / b) * c + ((a % b) * c) / b; diff --git a/aten/src/ATen/native/AffineGridGenerator.cpp b/aten/src/ATen/native/AffineGridGenerator.cpp index 17e45acb1bb76..315027d7069b4 100644 --- a/aten/src/ATen/native/AffineGridGenerator.cpp +++ b/aten/src/ATen/native/AffineGridGenerator.cpp @@ -110,7 +110,7 @@ static Tensor affine_grid_generator_4D_backward( AT_ASSERT(grad_grid.sizes() == IntArrayRef({N, H, W, 2})); auto grad_theta = base_grid.view({N, H * W, 3}) .transpose(1, 2) - .bmm(grad_grid.view({N, H * W, 2})); + .bmm(grad_grid.reshape({N, H * W, 2})); return grad_theta.transpose(1, 2); } @@ -126,7 +126,7 @@ static Tensor affine_grid_generator_5D_backward( AT_ASSERT(grad_grid.sizes() == IntArrayRef({N, D, H, W, 3})); auto grad_theta = base_grid.view({N, D * H * W, 4}) .transpose(1, 2) - .bmm(grad_grid.view({N, D * H * W, 3})); + .bmm(grad_grid.reshape({N, D * H * W, 3})); return grad_theta.transpose(1, 2); } diff --git a/aten/src/ATen/native/AmpKernels.cpp b/aten/src/ATen/native/AmpKernels.cpp new file mode 100644 index 0000000000000..32248c943193a --- /dev/null +++ b/aten/src/ATen/native/AmpKernels.cpp @@ -0,0 +1,41 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native { + +void _amp_foreach_non_finite_check_and_unscale_cpu_( + TensorList scaled_grads, + at::Tensor& found_inf, + const at::Tensor& inv_scale) { + _amp_foreach_non_finite_check_and_unscale_cpu_stub( + found_inf.device().type(), scaled_grads, found_inf, inv_scale); +} + +at::Tensor& _amp_update_scale_cpu_ ( + at::Tensor& current_scale, + 
at::Tensor& growth_tracker, + const at::Tensor& found_inf, + double growth_factor, + double backoff_factor, + int64_t growth_interval) { + return _amp_update_scale_cpu_stub( + growth_tracker.device().type(), current_scale, growth_tracker, + found_inf, growth_factor, backoff_factor, growth_interval); +} + +DEFINE_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu_stub); +DEFINE_DISPATCH(_amp_update_scale_cpu_stub); + +} // namespace at::native diff --git a/aten/src/ATen/native/AmpKernels.h b/aten/src/ATen/native/AmpKernels.h new file mode 100644 index 0000000000000..c463c80e1c6dc --- /dev/null +++ b/aten/src/ATen/native/AmpKernels.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include + +namespace at { +class Tensor; + +namespace native { + +using _amp_foreach_non_finite_check_and_unscale_cpu__fn = void (*)( + TensorList, + Tensor&, + const Tensor&); + +using _amp_update_scale_cpu__fn = Tensor& (*)( + Tensor&, + Tensor&, + const Tensor&, + double, + double, + int64_t); + +DECLARE_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu__fn, _amp_foreach_non_finite_check_and_unscale_cpu_stub); +DECLARE_DISPATCH(_amp_update_scale_cpu__fn, _amp_update_scale_cpu_stub); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/AutogradComposite.cpp b/aten/src/ATen/native/AutogradComposite.cpp index c97c7e2b139a4..dc98c90a596dd 100644 --- a/aten/src/ATen/native/AutogradComposite.cpp +++ b/aten/src/ATen/native/AutogradComposite.cpp @@ -1,6 +1,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -10,6 +11,7 @@ #include #include #include +#include #include #include #endif @@ -41,17 +43,17 @@ Tensor _new_zeros_with_same_feature_meta( const at::Tensor& self, const at::Tensor& other, int64_t self_num_batch_dims) { - auto other_sizes = other.sizes(); - auto other_strides = other.strides(); + auto other_sizes = other.sym_sizes(); + auto other_strides = other.sym_strides(); auto other_storage_offset = other.storage_offset(); - int64_t other_storage_numel = other.storage().nbytes() / other.itemsize(); + auto other_storage_numel = other.storage().sym_nbytes() / other.itemsize(); if (self_num_batch_dims == 0) { - auto new_tensor = at::zeros({other_storage_numel}, other.options()); - return new_tensor.as_strided(other_sizes, other_strides, other_storage_offset); + auto new_tensor = at::zeros_symint({other_storage_numel}, other.options()); + return new_tensor.as_strided_symint(other_sizes, other_strides, other_storage_offset); } - auto self_sizes = self.sizes(); + auto self_sizes = self.sym_sizes(); // NB: We don't check that the sizes of self is the same as that of other // because this function is also used in the inplace over view case @@ -63,14 +65,14 @@ Tensor _new_zeros_with_same_feature_meta( // this case. 
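The stride computation just below tacks batch-dimension strides onto other's layout by accumulating a running product seeded with other's storage size, so that each batch slice lands in its own contiguous block of the new storage. For reference, a minimal sketch of that running-product accumulation in isolation, using plain int64_t instead of SymInt and names of my own choosing:

#include <cstdint>
#include <vector>

// Contiguous (row-major) strides: stride[i] is the product of all sizes to its right,
// seeded with `base` (1 for an ordinary tensor, the slice numel when stacking slices).
std::vector<int64_t> strides_from_sizes(const std::vector<int64_t>& sizes, int64_t base) {
  std::vector<int64_t> strides(sizes.size());
  int64_t prod = base;
  for (int64_t i = static_cast<int64_t>(sizes.size()) - 1; i >= 0; --i) {
    strides[i] = prod;
    prod *= sizes[i];
  }
  return strides;
}

// strides_from_sizes({2, 3, 4}, 1) == {12, 4, 1}; seeding base with the storage numel of
// `other` gives the same accumulation used for the batch strides computed below.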
constexpr int64_t kSmallBufferSizeHint = 8; - auto out_sizes = c10::SmallBuffer(other.dim() + self_num_batch_dims); + auto out_sizes = c10::SmallVector(other.dim() + self_num_batch_dims); std::copy(self_sizes.begin(), self_sizes.begin() + self_num_batch_dims, out_sizes.begin()); std::copy(other_sizes.begin(), other_sizes.end(), out_sizes.begin() + self_num_batch_dims); // We use the strides of other, and tack on the strides computed with // the batch dims of self, so that the slices are arranged contiguously - auto out_strides = c10::SmallBuffer(other.dim() + self_num_batch_dims); - int64_t prod = other_storage_numel; + auto out_strides = c10::SmallVector(other.dim() + self_num_batch_dims); + auto prod = other_storage_numel; for (int64_t i = self_num_batch_dims - 1; i >= 0; --i) { out_strides[i] = prod; @@ -78,15 +80,30 @@ Tensor _new_zeros_with_same_feature_meta( } std::copy(other_strides.begin(), other_strides.end(), out_strides.begin() + self_num_batch_dims); - int64_t storage_numel = prod; + auto storage_numel = prod; // Inherit the TensorOptions of the primal - auto new_tensor = at::zeros({storage_numel}, other.options()); - return new_tensor.as_strided(out_sizes, out_strides, other_storage_offset); + auto new_tensor = at::zeros_symint({storage_numel}, other.options()); + return new_tensor.as_strided_symint(out_sizes, out_strides, other_storage_offset); } bool _has_same_storage_numel(const at::Tensor& base, const at::Tensor& other) { - return base.storage().nbytes() / base.itemsize() == other.storage().nbytes() / other.itemsize(); + return base.storage().sym_nbytes() / base.itemsize() == other.storage().sym_nbytes() / other.itemsize(); +} + +Tensor _lazy_clone(Tensor const& self) { + c10::StorageImpl* self_storage = self.storage().unsafeGetStorageImpl(); + c10::intrusive_ptr storage = + c10::impl::cow::lazy_clone_storage(*self_storage); + TORCH_CHECK(storage != nullptr); + auto tensor = c10::make_intrusive( + c10::Storage(std::move(storage)), + self.key_set(), + self.dtype()); + tensor->set_sizes_and_strides(self.sym_sizes(), + self.sym_strides(), + self.sym_storage_offset()); + return Tensor(std::move(tensor)); } } // namespace at::native diff --git a/aten/src/ATen/native/AveragePool3d.cpp b/aten/src/ATen/native/AveragePool3d.cpp index 110095e6fff00..c2d7b44a5076c 100644 --- a/aten/src/ATen/native/AveragePool3d.cpp +++ b/aten/src/ATen/native/AveragePool3d.cpp @@ -155,7 +155,7 @@ namespace { template static void avg_pool3d_out_frame( - scalar_t *input_p, + const scalar_t *input_p, scalar_t *output_p, int64_t nslices, int64_t itime, @@ -182,7 +182,7 @@ static void avg_pool3d_out_frame( int64_t i, j, ti; /* local pointers. 
*/ - scalar_t *ip = input_p + k * itime * iwidth * iheight; + const scalar_t *ip = input_p + k * itime * iwidth * iheight; scalar_t *op = output_p + k * otime * owidth * oheight; for (i = 0; i < otime * oheight * owidth; ++i) *(op + i) = 0; @@ -295,7 +295,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) ( AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(), "avg_pool3d_out_frame", [&] { - scalar_t *input_data = input.data_ptr(); + const scalar_t *input_data = input.const_data_ptr(); scalar_t *output_data = output.data_ptr(); avg_pool3d_out_frame( @@ -318,7 +318,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) ( AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(), "avg_pool3d_out_frame", [&] { - scalar_t *input_data = input.data_ptr(); + const scalar_t *input_data = input.const_data_ptr(); scalar_t *output_data = output.data_ptr(); at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { @@ -344,7 +344,7 @@ namespace { template static void avg_pool3d_backward_out_frame( scalar_t *gradInput_p, - scalar_t *gradOutput_p, + const scalar_t *gradOutput_p, int64_t nslices, int64_t itime, int64_t iwidth, @@ -371,7 +371,7 @@ static void avg_pool3d_backward_out_frame( /* local pointers */ scalar_t *ip = gradInput_p + k * itime * iwidth * iheight; - scalar_t *op = gradOutput_p + k * otime * owidth * oheight; + const scalar_t *op = gradOutput_p + k * otime * owidth * oheight; for (i = 0; i < itime*iwidth*iheight; i++) *(ip + i) = 0; @@ -479,7 +479,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cpu) ( "avg_pool3d_backward_out_frame", [&] { scalar_t *gradInput_data = gradInput.data_ptr(); - scalar_t *gradOutput_data = gradOutput.data_ptr(); + const scalar_t *gradOutput_data = gradOutput.const_data_ptr(); avg_pool3d_backward_out_frame( gradInput_data, gradOutput_data, @@ -503,7 +503,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cpu) ( "avg_pool3d_backward_out_frame", [&] { scalar_t *gradInput_data = gradInput.data_ptr(); - scalar_t *gradOutput_data = gradOutput.data_ptr(); + const scalar_t *gradOutput_data = gradOutput.const_data_ptr(); at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { for (const auto p : c10::irange(start, end)) { diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 0719cd3ab5600..40e6b34dc9725 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include #include @@ -463,8 +465,7 @@ TORCH_META_FUNC(linalg_ldl_solve) " does not match b dtype ", B.scalar_type()); - std::vector B_broadcast_size; - std::tie(B_broadcast_size, std::ignore) = at::native::_linalg_broadcast_batch_dims(B, LD); + auto [B_broadcast_size, _] = at::native::_linalg_broadcast_batch_dims(B, LD); // prefer column major strides auto result_strides = at::native::batched_matrix_contiguous_strides(B_broadcast_size, /*column_major=*/true); @@ -480,8 +481,7 @@ TORCH_META_FUNC(triangular_solve)(const Tensor& self, const Tensor& A, bool uppe at::native::linearSolveCheckInputs(self, A, "triangular_solve"); if (A.layout() == Layout::Strided) { - std::vector self_broadcast_size, A_broadcast_size; - std::tie(self_broadcast_size, A_broadcast_size) = at::native::_linalg_broadcast_batch_dims(self, A); + auto [self_broadcast_size, A_broadcast_size] = at::native::_linalg_broadcast_batch_dims(self, A); // make column major strides for BLAS const auto solution_strides = 
at::native::batched_matrix_contiguous_strides(self_broadcast_size, /*f-contig=*/true); @@ -629,8 +629,7 @@ TORCH_META_FUNC(linalg_qr)(const Tensor& A, c10::string_view mode) { at::native::checkIsMatrix(A, "linalg.qr"); at::native::checkFloatingOrComplex(A, "linalg.qr"); - bool compute_q, reduced_mode; - std::tie(compute_q, reduced_mode) = at::native::_parse_qr_mode(mode); + auto [compute_q, reduced_mode] = at::native::_parse_qr_mode(mode); auto A_shape = A.sizes().vec(); const auto m = A_shape.cend()[-2]; @@ -1517,7 +1516,7 @@ void _linalg_check_errors( } else { // Find the first non-zero info auto infos_cpu = infos.to(at::kCPU); - auto ptr = infos_cpu.data_ptr(); + auto ptr = infos_cpu.const_data_ptr(); auto n = infos.numel(); auto info_ptr = std::find_if(ptr, ptr + n, [](int32_t x) { return x != 0; }); info = *info_ptr; @@ -1604,8 +1603,7 @@ Tensor& linalg_inv_out(const Tensor& A, Tensor& result) { } Tensor linalg_inv(const Tensor& A) { - Tensor result, info; - std::tie(result, info) = at::linalg_inv_ex(A); + auto [result, info] = at::linalg_inv_ex(A); at::_linalg_check_errors(info, "linalg.inv", A.dim() == 2); return result; } @@ -1627,7 +1625,7 @@ static void apply_cholesky_solve(Tensor& b, Tensor& A, bool upper, Tensor& infos #else char uplo = upper ? 'U' : 'L'; - auto A_data = A.data_ptr(); + auto A_data = A.const_data_ptr(); auto b_data = b.data_ptr(); auto infos_data = infos.data_ptr(); auto A_mat_stride = matrixStride(A); @@ -1640,9 +1638,9 @@ static void apply_cholesky_solve(Tensor& b, Tensor& A, bool upper, Tensor& infos // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int info; for (const auto i : c10::irange(batch_size)) { - scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; + const scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; - lapackCholeskySolve(uplo, n, nrhs, A_working_ptr, ldab, b_working_ptr, ldab, &info); + lapackCholeskySolve(uplo, n, nrhs, const_cast(A_working_ptr), ldab, b_working_ptr, ldab, &info); infos_data[i] = info; if (info != 0) { return; @@ -1669,8 +1667,7 @@ Tensor cholesky_solve(const Tensor& self, const Tensor& A, bool upper) { "b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); TORCH_CHECK(A.dim() >= 2, "u should have at least 2 dimensions, but has ", A.dim(), " dimensions instead"); - Tensor self_broadcasted, A_broadcasted; - std::tie(self_broadcasted, A_broadcasted) = _linalg_broadcast_batch_dims(self, A, "cholesky_solve"); + auto [self_broadcasted, A_broadcasted] = _linalg_broadcast_batch_dims(self, A, "cholesky_solve"); return at::_cholesky_solve_helper(self_broadcasted, A_broadcasted, upper); } @@ -1783,8 +1780,7 @@ TORCH_IMPL_FUNC(linalg_cholesky_ex_out)(const Tensor& A, } Tensor linalg_cholesky(const Tensor& A, bool upper) { - Tensor L, info; - std::tie(L, info) = at::linalg_cholesky_ex(A, upper, /*check_errors=*/false); + auto [L, info] = at::linalg_cholesky_ex(A, upper, /*check_errors=*/false); at::_linalg_check_errors(info, "linalg.cholesky", A.dim() == 2); return L; } @@ -1921,8 +1917,7 @@ std::tuple linalg_solve_ex(const Tensor& A, const Tensor& B, bool left, bool check_errors) { - Tensor result, LU, pivots, info; - std::tie(result, LU, pivots, info) = at::_linalg_solve_ex(A, B, left, check_errors); + auto [result, LU, pivots, info] = at::_linalg_solve_ex(A, B, left, check_errors); return std::make_tuple(std::move(result), std::move(info)); } @@ -1939,8 +1934,7 @@ Tensor& linalg_solve_out(const Tensor& A, Tensor linalg_solve(const Tensor& A, 
const Tensor& B, bool left) { - Tensor result, info; - std::tie(result, info) = at::linalg_solve_ex(A, B, left); + auto [result, info] = at::linalg_solve_ex(A, B, left); at::_linalg_check_errors(info, "torch.linalg.solve", A.dim() == 2); return result; } @@ -1980,8 +1974,7 @@ std::tuple linalg_lu_factor_out(const Tensor& A, bool pivot, T } std::tuple linalg_lu_factor(const Tensor& A, bool pivot) { - Tensor LU, pivots, info; - std::tie(LU, pivots, info) = at::linalg_lu_factor_ex(A, pivot, /*check_errors=*/false); + auto [LU, pivots, info] = at::linalg_lu_factor_ex(A, pivot, /*check_errors=*/false); at::_linalg_check_errors(info, "torch.linalg.lu_factor", A.dim() == 2); return std::make_tuple(std::move(LU), std::move(pivots)); } @@ -2088,7 +2081,7 @@ TORCH_IMPL_FUNC(lu_unpack_out)(const Tensor& LU, .resize_outputs(false) .declare_static_shape(pivots.sizes(), /*squash_dim=*/pivots.dim() - 1) .add_output(perm) - .add_owned_input(pivots.contiguous()) + .add_owned_const_input(pivots.contiguous()) .build(); unpack_pivots_stub(pivots.device().type(), iter, std::min(m, n), m); @@ -2237,8 +2230,7 @@ static void triangular_solve_out_impl( } TORCH_IMPL_FUNC(triangular_solve_out)(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular, const Tensor& result, const Tensor& clone_A) { - Tensor self_broadcast, A_broadcast; - std::tie(self_broadcast, A_broadcast) = _linalg_broadcast_batch_dims(self, A, "triangular_solve"); + auto [self_broadcast, A_broadcast] = _linalg_broadcast_batch_dims(self, A, "triangular_solve"); bool copy_needed = !result.transpose(-2, -1).is_contiguous(); copy_needed |= !clone_A.transpose(-2, -1).is_contiguous(); @@ -2370,8 +2362,7 @@ TORCH_IMPL_FUNC(linalg_qr_out)(const Tensor& A, auto m = A.size(-2); auto n = A.size(-1); auto k = std::min(m, n); - bool compute_q, reduced_mode; - std::tie(compute_q, reduced_mode) = at::native::_parse_qr_mode(mode); + auto [compute_q, reduced_mode] = at::native::_parse_qr_mode(mode); // We need an auxiliary tensor to call geqrf @@ -2783,7 +2774,7 @@ Tensor linalg_eigvalsh(const Tensor& A, c10::string_view uplo) { Tensor& linalg_eigvalsh_out(const Tensor& A, c10::string_view uplo, Tensor& L) { auto V = at::empty({0}, A.options()); - at::_linalg_eigh_out(L, V, A, uplo, /*comptue_v=*/false); + at::_linalg_eigh_out(L, V, A, uplo, /*compute_v=*/false); return L; } @@ -2803,13 +2794,13 @@ static void linalg_eig_make_complex_eigenvectors_impl(Tensor& result, const Tens auto matrix_stride = matrixStride(real_vectors); auto result_data = result.data_ptr>(); - auto real_vectors_data = real_vectors.data_ptr(); - auto values_data = complex_values.data_ptr>(); + auto real_vectors_data = real_vectors.const_data_ptr(); + auto values_data = complex_values.const_data_ptr>(); for (auto b = decltype(batch_size){0}; b < batch_size; b++) { - scalar_t* vecs = &real_vectors_data[b * matrix_stride]; + const scalar_t* vecs = &real_vectors_data[b * matrix_stride]; c10::complex* res = &result_data[b * matrix_stride]; - c10::complex* vals = &values_data[b * n]; + const c10::complex* vals = &values_data[b * n]; for (auto j = decltype(n){0}; j < n; j++) { if (vals[j].imag() == 0.0) { // eigenvalue is real, then v(j) = VR(:,j) for (auto i = decltype(n){0}; i < n; i++) { @@ -3111,12 +3102,13 @@ Tensor linalg_eigvals(const Tensor& input) { if (_may_require_fw_or_bw_grad(input)) { return std::get<0>(at::linalg_eig(input)); } + return at::_linalg_eigvals(input); +} +Tensor _linalg_eigvals(const Tensor& input) { ScalarType complex_dtype = 
toComplexType(input.scalar_type()); Tensor values = at::empty({0}, input.options().dtype(complex_dtype)); - - at::linalg_eigvals_outf(input, values); - + linalg_eigvals_out(input, values); return values; } @@ -3164,7 +3156,7 @@ TORCH_IMPL_FUNC(_linalg_svd_out)(const Tensor& A, TORCH_CHECK(use_cusolver || !driver.has_value(), "torch.linalg.svd: keyword argument `driver=` is only supported on CUDA inputs with cuSOLVER backend."); - // A always needs to be copied as its contents will be destroyed during the computaton of the SVD + // A always needs to be copied as its contents will be destroyed during the computation of the SVD // Now, MAGMA needs the copy to be on CPU, while cuSOLVER needs it to be on CUDA, so we'll defer // the copy as a column major matrix to the backends. const auto info = at::zeros(IntArrayRef(A.sizes().begin(), A.sizes().end() - 2), A.options().dtype(kInt)); @@ -3213,7 +3205,7 @@ Tensor& linalg_svdvals_out(const Tensor& A, c10::optional driv // Dummies auto U = at::empty({0}, A.options()); auto Vh = at::empty({0}, A.options()); - at::_linalg_svd_out(U, S, Vh, A, /*full_matrices=*/false, /*comptue_uv=*/false, /*driver=*/driver); + at::_linalg_svd_out(U, S, Vh, A, /*full_matrices=*/false, /*compute_uv=*/false, /*driver=*/driver); return S; } @@ -3740,8 +3732,7 @@ std::tuple linalg_ldl_factor_out( std::tuple linalg_ldl_factor( const Tensor& self, bool hermitian) { - Tensor LD, pivots, info; - std::tie(LD, pivots, info) = + auto [LD, pivots, info] = at::linalg_ldl_factor_ex(self, hermitian, /*check_errors=*/false); at::_linalg_check_errors(info, "torch.linalg.ldl_factor", self.dim() == 2); return std::make_tuple(std::move(LD), std::move(pivots)); @@ -3820,8 +3811,7 @@ Tensor& linalg_solve_triangular_out( bool unitriangular, Tensor& out) { checkInputsSolver(A, B, left, "linalg.solve_triangular"); - Tensor A_, B_; - std::tie(B_, A_) = _linalg_broadcast_batch_dims(B, A, /*don't check errors*/nullptr); + auto [B_, A_] = _linalg_broadcast_batch_dims(B, A, /*don't check errors*/nullptr); // We'll write F-contig / F-transpose for FORTRAN contiguous / FORTRAN transpose etc // We say that a matrix is F-ready if it's F-contig OR F-transpose @@ -3913,7 +3903,7 @@ Tensor& linalg_solve_triangular_out( } // No need to conjugate anything if out_f is conj as AX = conj(B) <=> conj(A)conj(X) = B - // and X = B after the algortihm. We just anotate that A is conjugated later on + // and X = B after the algorithm. 
We just annotate that A is conjugated later on // The solution will be written into out_f, so it'll be conjugated already Tensor A_f = std::move(A_); // The A that will go into fortran @@ -3922,7 +3912,7 @@ Tensor& linalg_solve_triangular_out( bool A_is_neg = A_f.is_neg() != out_f.is_neg(); bool A_is_f_contig = (A_f.stride(-1) == 1) == transpose_A; if C10_UNLIKELY (!is_row_or_column_contiguous(A_f)) { - // We first anotate with flags on A_f all the conj / transpose / neg coming from out + // We first annotate with flags on A_f all the conj / transpose / neg coming from out // and then we clone the resulting tensor to resolve all of them in memory if (out_f.is_conj()) { A_f = A_f.conj(); diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index 8a04a66f388b1..f29970afe2b44 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -402,7 +402,7 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau) { using value_t = typename c10::scalar_value_type::type; auto self_data = self.data_ptr(); - auto tau_data = tau.data_ptr(); + auto tau_data = tau.const_data_ptr(); auto self_matrix_stride = matrixStride(self); auto tau_stride = tau.size(-1); auto batch_size = batchCount(self); @@ -423,17 +423,17 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau) { // and (batch_size - 1) calls to allocate and deallocate workspace using at::empty() int lwork = -1; scalar_t wkopt; - lapackOrgqr(m, n, k, self_data, lda, tau_data, &wkopt, lwork, &info); + lapackOrgqr(m, n, k, self_data, lda, const_cast(tau_data), &wkopt, lwork, &info); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0); lwork = std::max(1, real_impl(wkopt)); Tensor work = at::empty({lwork}, self.options()); for (const auto i : c10::irange(batch_size)) { scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; + const scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; // now compute the actual Q - lapackOrgqr(m, n, k, self_working_ptr, lda, tau_working_ptr, work.data_ptr(), lwork, &info); + lapackOrgqr(m, n, k, self_working_ptr, lda, const_cast(tau_working_ptr), work.data_ptr(), lwork, &info); // info from lapackOrgqr only reports if the i-th parameter is wrong // so we don't need to check it all the time @@ -649,8 +649,8 @@ void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& other, bo char side = left ? 'L' : 'R'; char trans = transpose ? (input.is_complex() ? 
'C' : 'T') : 'N'; - auto input_data = input.data_ptr(); - auto tau_data = tau.data_ptr(); + auto input_data = input.const_data_ptr(); + auto tau_data = tau.const_data_ptr(); auto other_data = other.data_ptr(); auto input_matrix_stride = matrixStride(input); @@ -670,21 +670,21 @@ void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& other, bo // Query for the optimal size of the workspace tensor int lwork = -1; scalar_t wkopt; - lapackOrmqr(side, trans, m, n, k, input_data, lda, tau_data, other_data, ldc, &wkopt, lwork, &info); + lapackOrmqr(side, trans, m, n, k, const_cast(input_data), lda, const_cast(tau_data), other_data, ldc, &wkopt, lwork, &info); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0); lwork = std::max(1, real_impl(wkopt)); Tensor work = at::empty({lwork}, input.options()); for (const auto i : c10::irange(batch_size)) { - scalar_t* input_working_ptr = &input_data[i * input_matrix_stride]; + const scalar_t* input_working_ptr = &input_data[i * input_matrix_stride]; scalar_t* other_working_ptr = &other_data[i * other_matrix_stride]; - scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; + const scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; // now compute the actual result lapackOrmqr( side, trans, m, n, k, - input_working_ptr, lda, - tau_working_ptr, + const_cast(input_working_ptr), lda, + const_cast(tau_working_ptr), other_working_ptr, ldc, work.data_ptr(), lwork, &info); @@ -725,7 +725,7 @@ void apply_triangular_solve(const Tensor& A, const Tensor& B, bool left, bool up char side = left ? 'L' : 'R'; const char trans = to_blas(transpose); - auto A_data = A.data_ptr(); + auto A_data = A.const_data_ptr(); auto B_data = B.data_ptr(); auto A_mat_stride = matrixStride(A); auto B_mat_stride = matrixStride(B); @@ -737,9 +737,9 @@ void apply_triangular_solve(const Tensor& A, const Tensor& B, bool left, bool up auto ldb = std::max(1, B.size(-2)); for (const auto i : c10::irange(batch_size)) { - scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; + const scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; scalar_t* B_working_ptr = &B_data[i * B_mat_stride]; - blasTriangularSolve(side, uplo, trans, diag, m, n, A_working_ptr, lda, B_working_ptr, ldb); + blasTriangularSolve(side, uplo, trans, diag, m, n, const_cast(A_working_ptr), lda, B_working_ptr, ldb); } #endif } @@ -841,26 +841,26 @@ void apply_ldl_solve( auto b_stride = B.dim() > 2 ? B.stride(-3) : 0; auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; - auto a_data = A.data_ptr(); + auto a_data = A.const_data_ptr(); auto b_data = B.data_ptr(); auto pivots_ = pivots.to(kInt); - auto pivots_data = pivots_.data_ptr(); + auto pivots_data = pivots_.const_data_ptr(); auto ldl_solve_func = hermitian ? lapackLdlSolveHermitian : lapackLdlSolveSymmetric; int info = 0; for (const auto i : c10::irange(batch_size)) { - scalar_t* a_working_ptr = &a_data[i * a_stride]; + const scalar_t* a_working_ptr = &a_data[i * a_stride]; scalar_t* b_working_ptr = &b_data[i * b_stride]; - auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; + const auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; ldl_solve_func( uplo, n, nrhs, - a_working_ptr, + const_cast(a_working_ptr), lda, - pivots_working_ptr, + const_cast(pivots_working_ptr), b_working_ptr, ldb, &info); @@ -968,9 +968,9 @@ void apply_lu_solve(const Tensor& LU, const Tensor& pivots, const Tensor& B, Tra "PyTorch with LAPACK. 
Please use PyTorch built with LAPACK support."); #else auto b_data = B.data_ptr(); - auto lu_data = LU.data_ptr(); + auto lu_data = LU.const_data_ptr(); const auto trans = to_blas(transpose); - auto pivots_data = pivots.data_ptr(); + auto pivots_data = pivots.const_data_ptr(); auto b_stride = matrixStride(B); auto lu_stride = LU.dim() > 2 ? LU.stride(-3) : 0; auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; @@ -992,10 +992,10 @@ void apply_lu_solve(const Tensor& LU, const Tensor& pivots, const Tensor& B, Tra for (const auto i : c10::irange(batch_size)) { int64_t lu_index_i = lu_index(i); scalar_t* b_working_ptr = &b_data[i * b_stride]; - scalar_t* lu_working_ptr = &lu_data[lu_index_i * lu_stride]; - int* pivots_working_ptr = &pivots_data[lu_index_i * pivots_stride]; + const scalar_t* lu_working_ptr = &lu_data[lu_index_i * lu_stride]; + const int* pivots_working_ptr = &pivots_data[lu_index_i * pivots_stride]; - lapackLuSolve(trans, n, nrhs, lu_working_ptr, leading_dimension, pivots_working_ptr, + lapackLuSolve(trans, n, nrhs, const_cast(lu_working_ptr), leading_dimension, const_cast(pivots_working_ptr), b_working_ptr, leading_dimension, &info); // info from lapackLuSolve only reports if the i-th parameter is wrong diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 8816f9622d85c..78f57470a922d 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -1416,7 +1416,7 @@ Tensor& comparison_op_(Tensor& self, const Scalar& other, OutImpl& out_impl) { } // We need explicit cast to OutFunc because each *_out func is overloaded twice. Without An explicit cast, merely -// referring to *_out function is ambiguious. +// referring to *_out function is ambiguous. using OutFunc = std::add_const::type; // less, alias for torch.lt diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 665602fcf18a4..88069616bf8e7 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -81,7 +81,7 @@ TORCH_IMPL_FUNC(addmv_out_cpu)(const Tensor &self, const Tensor &mat, const Tens if (result.numel() != 0) { NoNamesGuard guard; - if (use_mkldnn_lower_precision_matmul(mat, vec, /*result=*/Tensor())){ + if (use_mkldnn_matmul(mat, vec, /*result=*/Tensor())){ mkldnn_matmul(mat, vec, result, beta_.to(), alpha_.to()); return; } @@ -176,7 +176,7 @@ Tensor dot(const Tensor &self, const Tensor &other){ return at::_efficientzerotensor({}, self.options()); } - if (use_mkldnn_lower_precision_matmul(self, other, /*result=*/Tensor())){ + if (use_mkldnn_matmul(self, other, /*result=*/Tensor())){ // mkldnn matmul expect result have sizes info to create ideep tensor auto r = at::empty({1, 1}, self.options()); mkldnn_matmul(self, other, r, /*beta=*/0); @@ -185,7 +185,7 @@ Tensor dot(const Tensor &self, const Tensor &other){ return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, self.scalar_type(), "dot", [&] { Tensor result = at::empty({}, self.options()); - result.fill_(dot_impl(self.numel(), self.data_ptr(), self.stride(0), other.data_ptr(), other.stride(0))); + result.fill_(dot_impl(self.numel(), const_cast(self.const_data_ptr()), self.stride(0), const_cast(other.const_data_ptr()), other.stride(0))); return result; }); } @@ -216,7 +216,7 @@ Tensor vdot(const Tensor &self, const Tensor &other){ return AT_DISPATCH_COMPLEX_TYPES(self.scalar_type(), "vdot", [&] { Tensor result = at::empty({}, self.options()); - result.fill_(vdot_impl(self.numel(), self.data_ptr(), 
self.stride(0), other.data_ptr(), other.stride(0))); + result.fill_(vdot_impl(self.numel(), const_cast(self.const_data_ptr()), self.stride(0), const_cast(other.const_data_ptr()), other.stride(0))); return result; }); diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index f2c61646b1b5c..48a077814880b 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ -1,6 +1,8 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include #include #include #include @@ -9,6 +11,10 @@ #include #include +#if defined(__aarch64__) && !defined(C10_MOBILE) +#include +#endif + namespace { /// Wrapper for const_cast with type-inference. @@ -74,24 +80,53 @@ extern "C" void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int namespace at::native { namespace blas_impl { +#if defined(__aarch64__) && !defined(C10_MOBILE) +void fp16_gemv_notrans( + const int m, + const int n, + const float alpha, + const float16_t* a, + const int lda, + const float16_t* x, + const int incx, + const float beta, + float16_t* y, + const int incy); + +void fp16_gemv_trans( + const int m, + const int n, + const float alpha, + const float16_t* a, + const int lda, + const float16_t* x, + const int incx, + const float beta, + float16_t* y, + const int incy); +#endif template -bool scal_use_fast_path(int64_t n, int64_t incx) { +bool scal_use_fast_path(C10_UNUSED int64_t n, C10_UNUSED int64_t incx) { return false; } template -bool gemv_use_fast_path(int64_t m, int64_t n, int64_t lda, int64_t incx, int64_t incy) { +bool gemv_use_fast_path(C10_UNUSED int64_t m, C10_UNUSED int64_t n, + C10_UNUSED int64_t lda, C10_UNUSED int64_t incx, C10_UNUSED int64_t incy) { return false; } template -void scal_fast_path(int *n, scalar_t *a, scalar_t *x, int *incx) { +void scal_fast_path(C10_UNUSED int *n, C10_UNUSED scalar_t *a, C10_UNUSED scalar_t *x, C10_UNUSED int *incx) { TORCH_INTERNAL_ASSERT(false, "scal_fast_path shouldn't be called for this configuration"); } template -void gemv_fast_path(const char *trans, const int *m, const int *n, const scalar_t *alpha, const scalar_t *a, const int *lda, const scalar_t *x, const int *incx, const scalar_t *beta, scalar_t *y, const int *incy) { +void gemv_fast_path(C10_UNUSED const char *trans, C10_UNUSED const int *m, C10_UNUSED const int *n, + C10_UNUSED const scalar_t *alpha, C10_UNUSED const scalar_t *a, C10_UNUSED const int *lda, + C10_UNUSED const scalar_t *x, C10_UNUSED const int *incx, C10_UNUSED const scalar_t *beta, + C10_UNUSED scalar_t *y, C10_UNUSED const int *incy) { TORCH_INTERNAL_ASSERT(false, "gemv_fast_path shouldn't be called for this configuration"); } @@ -155,7 +190,248 @@ INSTANTIATE(int16_t); INSTANTIATE(int); INSTANTIATE(int64_t); INSTANTIATE(c10::BFloat16); +#if defined(__aarch64__) && !defined(C10_MOBILE) +template <> +bool scal_use_fast_path(C10_UNUSED int64_t n, C10_UNUSED int64_t incx) { + return false; +} + +template <> +bool gemv_use_fast_path( + C10_UNUSED int64_t m, + C10_UNUSED int64_t n, + C10_UNUSED int64_t lda, + C10_UNUSED int64_t incx, + C10_UNUSED int64_t incy) { + return true; +} + +#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC +static inline float16_t reduce(float16x4_t x) { + auto sum = vpadd_f16(x, x); + return vget_lane_f16(vpadd_f16(sum, sum), 0); +} +static inline float16_t reduce(float16x8_t x) { + return reduce(vadd_f16(vget_low_f16(x), vget_high_f16(x))); +} + + +static void fp16_gemv_trans_fp16_arith(const int m, const int n, const float16_t* a, const int lda, 
const float16_t *x, float16_t* y, int incy) { + parallel_for(0, n / 4, 1, [&](int begin, int end) { + for (auto i = begin * 4 ; i < end * 4; i += 4) { + float16x8_t sum0Vec = vdupq_n_f16(0); + float16x8_t sum1Vec = vdupq_n_f16(0); + float16x8_t sum2Vec = vdupq_n_f16(0); + float16x8_t sum3Vec = vdupq_n_f16(0); + const auto row0 = a + lda * (i + 0); + const auto row1 = a + lda * (i + 1); + const auto row2 = a + lda * (i + 2); + const auto row3 = a + lda * (i + 3); + for (auto j = 0; j < m; j += 8) { + float16x8_t xVec = vld1q_f16(x + j); + float16x8_t a0Vec = vld1q_f16(row0 + j); + sum0Vec = vaddq_f16(sum0Vec, vmulq_f16(a0Vec, xVec)); + float16x8_t a1Vec = vld1q_f16(row1 + j); + sum1Vec = vaddq_f16(sum1Vec, vmulq_f16(a1Vec, xVec)); + float16x8_t a2Vec = vld1q_f16(row2 + j); + sum2Vec = vaddq_f16(sum2Vec, vmulq_f16(a2Vec, xVec)); + float16x8_t a3Vec = vld1q_f16(row3 + j); + sum3Vec = vaddq_f16(sum3Vec, vmulq_f16(a3Vec, xVec)); + } + y[(i + 0) * incy] = reduce(sum0Vec); + y[(i + 1) * incy] = reduce(sum1Vec); + y[(i + 2) * incy] = reduce(sum2Vec); + y[(i + 3) * incy] = reduce(sum3Vec); + } + }); +} +#endif + +static inline float reduce(float32x4_t x) { + auto sum = vpaddq_f32(x, x); + return vgetq_lane_f32(vpaddq_f32(sum, sum), 0); +} + +static void fp16_gemv_trans_fp32_arith(const int m, const int n, const float16_t* a, const int lda, const float16_t *x, float16_t* y, int incy) { + parallel_for(0, n / 4, 1, [&](int begin, int end) { + for (auto i = begin * 4 ; i < end * 4; i += 4) { + float32x4_t sum0Vec = vdupq_n_f32(0); + float32x4_t sum1Vec = vdupq_n_f32(0); + float32x4_t sum2Vec = vdupq_n_f32(0); + float32x4_t sum3Vec = vdupq_n_f32(0); + const auto row0 = a + lda * (i + 0); + const auto row1 = a + lda * (i + 1); + const auto row2 = a + lda * (i + 2); + const auto row3 = a + lda * (i + 3); + for (auto j = 0; j < m; j += 4) { + float32x4_t xVec = vcvt_f32_f16(vld1_f16(x + j)); + float32x4_t a0Vec = vcvt_f32_f16(vld1_f16(row0 + j)); + sum0Vec = vaddq_f32(sum0Vec, vmulq_f32(a0Vec, xVec)); + float32x4_t a1Vec = vcvt_f32_f16(vld1_f16(row1 + j)); + sum1Vec = vaddq_f32(sum1Vec, vmulq_f32(a1Vec, xVec)); + float32x4_t a2Vec = vcvt_f32_f16(vld1_f16(row2 + j)); + sum2Vec = vaddq_f32(sum2Vec, vmulq_f32(a2Vec, xVec)); + float32x4_t a3Vec = vcvt_f32_f16(vld1_f16(row3 + j)); + sum3Vec = vaddq_f32(sum3Vec, vmulq_f32(a3Vec, xVec)); + } + y[(i + 0) * incy] = reduce(sum0Vec); + y[(i + 1) * incy] = reduce(sum1Vec); + y[(i + 2) * incy] = reduce(sum2Vec); + y[(i + 3) * incy] = reduce(sum3Vec); + } + }); +} + +void fp16_gemv_trans( + const int m, + const int n, + const float alpha, + const float16_t* a, + const int lda, + const float16_t* x, + const int incx, + const float beta, + float16_t* y, + const int incy) { + if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && n % 4 == 0) { +#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC + return at::globalContext().allowFP16ReductionCPU() && m % 8 == 0 ? 
fp16_gemv_trans_fp16_arith(m, n, a, lda, x, y, incy) + : fp16_gemv_trans_fp32_arith(m, n, a, lda, x, y, incy); +#else + return fp16_gemv_trans_fp32_arith(m, n, a, lda, x, y, incy); +#endif + } + for (const auto i : c10::irange(n)) { + float sum = 0; + const auto row_ = a + lda * i; + for (const auto j : c10::irange(m)) { + sum += x[j * incx] * row_[j]; + } + if (beta == 0.0) { + y[i * incy] = alpha * sum; + } else { + y[i * incy] = beta * y[i * incy] + alpha * sum; + } + } +} + + +#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC +static void fp16_gemv_notrans_fp16_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) { + for (auto j = 0; j < n; j++) { + auto vecCol = vdup_n_f16(x[j]); + const auto* column = a + lda * j; + for (auto i = 0; i < m; i += 4) { + auto yf16 = y + i; + auto matRow = vld1_f16(column + i); + auto resVec = j != 0 ? vld1_f16(yf16) : vdup_n_f16(0); + resVec = vfma_lane_f16(resVec, matRow, vecCol, 0); + vst1_f16(yf16, resVec); + } + } +} +#endif + +static void fp16_gemv_notrans_fp32_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) { + std::vector sum(m); + for (auto j = 0; j < n; j++) { + auto vecCol = vdup_n_f32(x[j]); + const auto* column = a + lda * j; + for (auto i = 0; i < m; i += 4) { + auto sf32 = sum.data() + i; + auto matRow = vcvt_f32_f16(vld1_f16(column + i)); + auto resVec = j != 0 ? vld1q_f32(sf32) : vdupq_n_f32(0); + resVec = vfmaq_lane_f32(resVec, matRow, vecCol, 0); + vst1q_f32(sf32, resVec); + } + } + + for (auto i = 0; i < m; i+= 4) { + vst1_f16(y + i, vcvt_f16_f32(vld1q_f32(sum.data() + i))); + } +} + +void fp16_gemv_notrans( + const int m, + const int n, + const float alpha, + const float16_t* a, + const int lda, + const float16_t* x, + const int incx, + const float beta, + float16_t* y, + const int incy) { + if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && incy == 1) { +#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC + return at::globalContext().allowFP16ReductionCPU() ? 
fp16_gemv_notrans_fp16_arith(m, n, a, lda, x, y) + : fp16_gemv_notrans_fp32_arith(m, n, a, lda, x, y); +#else + return fp16_gemv_notrans_fp32_arith(m, n, a, lda, x, y); +#endif + } + std::vector sum(m); + for (const auto j : c10::irange(n)) { + const auto* column_ = a + lda * j; + auto z = alpha * x[j * incx]; + for (const auto i : c10::irange(m)) { + sum[i] += z * column_[i]; + } + } + if (beta == 0.0) { + for (const auto i : c10::irange(m)) { + y[i * incy] = sum[i]; + } + } else { + for (const auto i : c10::irange(m)) { + y[i * incy] += sum[i]; + } + } +} + +template <> +void gemv_fast_path( + const char* trans, + const int* m, + const int* n, + const at::Half* alpha, + const at::Half* a, + const int* lda, + const at::Half* x, + const int* incx, + const at::Half* beta, + at::Half* y, + const int* incy) { + using namespace c10::detail; + if ((trans[0] == 'T') || (trans[0] == 't')) { + fp16_gemv_trans( + *m, + *n, + fp16_from_bits(alpha->x), + reinterpret_cast(a), + *lda, + reinterpret_cast(x), + *incx, + fp16_from_bits(beta->x), + reinterpret_cast(y), + *incy); + } else { + fp16_gemv_notrans( + *m, + *n, + fp16_from_bits(alpha->x), + reinterpret_cast(a), + *lda, + reinterpret_cast(x), + *incx, + fp16_from_bits(beta->x), + reinterpret_cast(y), + *incy); + } +} +#else INSTANTIATE(c10::Half); +#endif #undef INSTANTIATE } // namespace blas_impl diff --git a/aten/src/ATen/native/Bucketization.cpp b/aten/src/ATen/native/Bucketization.cpp index 688512f2711d0..736273a40cb09 100644 --- a/aten/src/ATen/native/Bucketization.cpp +++ b/aten/src/ATen/native/Bucketization.cpp @@ -16,7 +16,7 @@ /* Implement a numpy like searchsorted and a TF like bucketize function running on cpu * - * - torch.searchsorted(sorted_sequence, values, right=False, side='left', out_int32=False, sorter=None) + * - torch.searchsorted(sorted_sequence, values, right=False, side=None, out_int32=False, sorter=None) * sorted_sequence - N*D or 1D (apply to all values) tensor containing sorted sequences in last dimension * values - N*D tensor or a Scalar (when sorted_sequence is 1D) containing the search values * right - corresponding to lower bound if False and upper bound if True @@ -92,9 +92,9 @@ void searchsorted_cpu_contiguous(Tensor& result, const Tensor& input, const Tens int64_t idim_in = is_scalar_input ? 1 : input.sizes().back(); int64_t idim_bd = boundaries.sizes().back(); - const input_t *data_in = input.data_ptr(); - const input_t *data_bd = boundaries.data_ptr(); - const int64_t *data_st = sorter.defined() ? sorter.data_ptr() : nullptr; + const input_t *data_in = input.const_data_ptr(); + const input_t *data_bd = boundaries.const_data_ptr(); + const int64_t *data_st = sorter.defined() ? 
sorter.const_data_ptr() : nullptr; output_t *data_out = result.data_ptr(); bool is_1d_boundaries = boundaries.dim() == 1; @@ -162,7 +162,7 @@ Tensor& searchsorted_out_cpu( return result; } - // for non-contiguous result tensors, we write the output to a contiguous copy so we can later copy back, maintaing the original result tensor + // for non-contiguous result tensors, we write the output to a contiguous copy so we can later copy back, maintaining the original result tensor Tensor out = result; if (!result.is_contiguous()) { out = result.contiguous(); diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index f0e5f333df80a..ac49364573c48 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -164,6 +164,11 @@ void gemm( const float beta, float *c, int64_t ldc) { internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc); +#if AT_MKLDNN_ENABLED() + if (mkldnn_bf32_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) { + return; + } +#endif #if AT_BUILD_WITH_BLAS() if (use_blas_gemm(transa, transb, m, n, k, lda, ldb, ldc)) { int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; @@ -394,6 +399,42 @@ void gemm( } } +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const float alpha, + const at::Half *a, int64_t lda, + const at::Half *b, int64_t ldb, + const float beta, + float *c, int64_t ldc) { + internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc); +#ifdef MKL_HAS_SHGEMM + if (use_blas_gemm(transa, transb, m, n, k, lda, ldb, ldc)) { + int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; + mkl_gemm_f16f16f32(transa, transb, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); + return; + } +#endif + // for the fallback path, first compute gemm with beta = 0, + // and then add c in full precision. 
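The new at::Half x at::Half -> float gemm overload above prefers MKL's shgemm when it is available; otherwise it runs the half-precision gemm with beta = 0 into a scratch buffer and folds the existing C back in at float precision, special-casing beta == 0 so a NaN already sitting in the uninitialized C cannot leak into the result. A standalone sketch of that accumulation step follows (the function name and the column-major layout assumption are mine, not the ATen code):

#include <cstdint>
#include <vector>

// Fold a freshly computed m x n product (column-major in `tmp`, leading dimension m)
// into C with the usual C = beta * C + tmp rule, skipping the read of C when beta == 0.
void scale_and_accumulate(const std::vector<float>& tmp,
                          float beta, float* c, int64_t ldc, int64_t m, int64_t n) {
  for (int64_t j = 0; j < n; ++j) {
    for (int64_t i = 0; i < m; ++i) {
      const float v = tmp[j * m + i];
      c[j * ldc + i] = (beta == 0.f) ? v : beta * c[j * ldc + i] + v;
    }
  }
}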
+ int64_t c_size = n * m; + std::vector float16_c(c_size, 0.f); + gemm_stub( + at::kCPU, at::kHalf, + transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float16_c.data(), m); + for (const auto j : c10::irange(n)) { + for (const auto i : c10::irange(m)) { + auto offset = j * ldc + i; + // beta == 0 won't propagate NaN from C + if (beta == 0.f) { + c[offset] = c10::convert(float16_c[j * m + i]); + } else { + c[offset] = beta * c[offset] + c10::convert(float16_c[j * m + i]); + } + } + } +} + void gemm( TransposeType transa, TransposeType transb, int64_t m, int64_t n, int64_t k, diff --git a/aten/src/ATen/native/CPUBlas.h b/aten/src/ATen/native/CPUBlas.h index 8c9075a06780e..3b30df1c21fad 100644 --- a/aten/src/ATen/native/CPUBlas.h +++ b/aten/src/ATen/native/CPUBlas.h @@ -88,6 +88,15 @@ void gemm( float beta, at::Half *c, int64_t ldc); +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const float alpha, + const at::Half *a, int64_t lda, + const at::Half *b, int64_t ldb, + const float beta, + float *c, int64_t ldc); + void gemm( TransposeType transa, TransposeType transb, int64_t m, int64_t n, int64_t k, diff --git a/aten/src/ATen/native/CPUFallback.cpp b/aten/src/ATen/native/CPUFallback.cpp index b8d9d3b9347d9..502c61e4d144c 100644 --- a/aten/src/ATen/native/CPUFallback.cpp +++ b/aten/src/ATen/native/CPUFallback.cpp @@ -89,6 +89,7 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool std::vector> tensorlist_args; std::vector tensorlist_args_indices; + c10::optional tgt_device = c10::nullopt; // save converted cpu tensor for TensorList std::vector tensorlist_cpu_args; @@ -108,6 +109,25 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool auto cpu_ivalue = c10::IValue(c10::List(to_cpu(ivalue.toTensorList().vec()))); tensorlist_cpu_args.push_back(cpu_ivalue); (*stack)[arguments_begin + idx] = std::move(cpu_ivalue); + tensorlist_args.push_back(ivalue.toTensorList()); + } else if (ivalue.isOptionalTensorList()) { + auto opt_tensors = ivalue.toOptionalTensorList().vec(); + std::vector need_convert_tensors; + std::vector need_convert_tensors_index; + for (auto i : c10::irange(opt_tensors.size())) { + if (!opt_tensors[i].has_value() || !opt_tensors[i]->defined()) continue; + need_convert_tensors.push_back(opt_tensors[i].value()); + need_convert_tensors_index.push_back(i); + } + auto cpu_tensors = to_cpu(need_convert_tensors); + for (const auto i : c10::irange(need_convert_tensors_index.size())) { + auto idx = need_convert_tensors_index[i]; + opt_tensors[idx] = cpu_tensors[i]; + } + (*stack)[arguments_begin + idx] = c10::IValue(opt_tensors); + } else if (ivalue.isDevice()) { + tgt_device = ivalue.toDevice(); + (*stack)[arguments_begin + idx] = c10::IValue(c10::Device(kCPU)); } } // XLA requires all of the tensor arguments to be gathered up and converted to CPU together. @@ -151,7 +171,7 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool // the temporary CPU output tensor that we created. // // Note [CPU Fallback Does Not Handle View Operators] - // Also note that we are incapable of handling immutable alises properly. + // Also note that we are incapable of handling immutable aliases properly. // Why? // Schemas with an immutable alias'd tensor outputs correspond to view operators. 
// For example, the `view_as` schema from native_functions.yaml: @@ -168,8 +188,9 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool auto returns = torch::jit::last(stack, num_returns); const auto returns_begin = stack->size() - num_returns; - c10::optional tgt_device = - compute_target_device(tensor_args, tensorlist_args); + if (tgt_device == c10::nullopt) { + tgt_device = compute_target_device(tensor_args, tensorlist_args); + } for (const auto idx : c10::irange(returns.size())) { const AliasInfo* alias_info = schema_returns[idx].alias_info(); diff --git a/aten/src/ATen/native/ChanelShuffle.cpp b/aten/src/ATen/native/ChanelShuffle.cpp index 2d9ca7cb459d7..be57917967fa9 100644 --- a/aten/src/ATen/native/ChanelShuffle.cpp +++ b/aten/src/ATen/native/ChanelShuffle.cpp @@ -20,11 +20,16 @@ namespace at::native { Tensor channel_shuffle_cpu(const Tensor& self, int64_t groups) { - auto memory_format = self.suggest_memory_format(); - auto output = at::empty({0}, self.options()); - output.resize_(self.sizes(), memory_format); - auto input = self.contiguous(memory_format); - channel_shuffle_kernel(kCPU, output, input, groups); + Tensor output; + if (self.numel() == 0) { + output = self.alias(); + } else { + auto memory_format = self.suggest_memory_format(); + output = at::empty({0}, self.options()); + output.resize_(self.sizes(), memory_format); + auto input = self.contiguous(memory_format); + channel_shuffle_kernel(kCPU, output, input, groups); + } return namedinference::propagate_names_if_nonempty( output, self.has_names() ? self.names() : at::ArrayRef{}); @@ -69,7 +74,7 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) { // It is not clear, however from initial looking around it feels that // this may not be correct. // In this case channels last will likely require custom implementation - // if we want to preseve the memory order. + // if we want to preserve the memory order. // XNNPACK has channel shuffle op for NHWC. For mobile usecase this is good. // For server we will have to do a custom implementation. // For ChannelsFirst, a.k.a Contiguous, memory format we will also need diff --git a/aten/src/ATen/native/CompositeRandomAccessorCommon.h b/aten/src/ATen/native/CompositeRandomAccessorCommon.h index 919647992ccab..9111c3515afce 100644 --- a/aten/src/ATen/native/CompositeRandomAccessorCommon.h +++ b/aten/src/ATen/native/CompositeRandomAccessorCommon.h @@ -118,7 +118,7 @@ class CompositeRandomAccessor { using value_type = composite_value_type; using reference = references_holder; // Note that CompositeRandomAccessor does not hold key and values - // in a specific datastrcture, which means that a pointer to a (key, value) + // in a specific datastructure, which means that a pointer to a (key, value) // is not defined. Hence we just use a pointer type of the KeyAccessor. using pointer = typename std::iterator_traits::pointer; using difference_type = typename std::iterator_traits::difference_type; diff --git a/aten/src/ATen/native/Constraints.cpp b/aten/src/ATen/native/Constraints.cpp index 9b7703313528d..8f3f8c11e696c 100644 --- a/aten/src/ATen/native/Constraints.cpp +++ b/aten/src/ATen/native/Constraints.cpp @@ -29,7 +29,7 @@ void sym_constrain_range( int64_t min_val = min.has_value() ? min.value() : std::numeric_limits::min(); int64_t max_val = max.has_value() ? 
max.value() : std::numeric_limits::max(); - int64_t size_as_int = size.toInt(); + int64_t size_as_int = size.toLong(); TORCH_CHECK( max_val >= min_val, diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 5d2691b9761ee..4b814f3e442cb 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -358,7 +358,6 @@ static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const } bool can_use_miopen_channels_last_2d = false; -#if defined(USE_ROCM) && (ROCM_VERSION >= 40300) // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen // See #64427 static c10::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); @@ -370,7 +369,6 @@ static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const ( (input_memory_format == at::MemoryFormat::ChannelsLast) || (weight_memory_format == at::MemoryFormat::ChannelsLast) ) ); -#endif bool can_use_miopen_channels_last_3d = false; diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index e76128db847a0..717280a6cdcab 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -848,9 +848,8 @@ at::Tensor complex_convolution( SymIntArrayRef output_padding, c10::SymInt groups) { check_input_same_type_as_parameters(input, weight, bias); - Tensor i_r, i_i, w_r, w_i; - std::tie(i_r, i_i) = complex_to_real(input.resolve_conj()); - std::tie(w_r, w_i) = complex_to_real(weight.resolve_conj()); + auto [i_r, i_i] = complex_to_real(input.resolve_conj()); + auto [w_r, w_i] = complex_to_real(weight.resolve_conj()); // [NOTE] Complex Convolution // conv(W, x, b) = conv(Wr, xr, br) - conv(Wi, xi, 0) + i(conv(Wi, xr, bi) + conv(Wr, xi, 0)) @@ -866,8 +865,7 @@ at::Tensor complex_convolution( b = at::convolution_symint(i_i, w_i, bias, stride, padding, dilation, transposed, output_padding, groups); c = at::convolution_symint(i_r + i_i, w_r + w_i, bias, stride, padding, dilation, transposed, output_padding, groups); } else { - Tensor b_r, b_i; - std::tie(b_r, b_i) = complex_to_real(bias.resolve_conj()); + auto [b_r, b_i] = complex_to_real(bias.resolve_conj()); a = at::convolution_symint(i_r, w_r, b_r, stride, padding, dilation, transposed, output_padding, groups); b = at::convolution_symint(i_i, w_i, Tensor(), stride, padding, dilation, transposed, output_padding, groups); c = at::convolution_symint(i_r + i_i, w_r + w_i, b_r + b_i, stride, padding, dilation, transposed, output_padding, groups); @@ -887,9 +885,8 @@ at::Tensor complex_convolution_mode( c10::SymInt groups) { auto bias = bias_opt.value_or(Tensor()); check_input_same_type_as_parameters(input, weight, bias); - Tensor i_r, i_i, w_r, w_i; - std::tie(i_r, i_i) = complex_to_real(input.resolve_conj()); - std::tie(w_r, w_i) = complex_to_real(weight.resolve_conj()); + auto [i_r, i_i] = complex_to_real(input.resolve_conj()); + auto [w_r, w_i] = complex_to_real(weight.resolve_conj()); // See [NOTE] Complex Convolution Tensor a, b, c; @@ -898,8 +895,7 @@ at::Tensor complex_convolution_mode( b = at::_convolution_mode_symint(i_i, w_i, bias, stride, padding, dilation, groups); c = at::_convolution_mode_symint(i_r + i_i, w_r + w_i, bias, stride, padding, dilation, groups); } else { - Tensor b_r, b_i; - std::tie(b_r, b_i) = complex_to_real(bias.resolve_conj()); + auto [b_r, b_i] = complex_to_real(bias.resolve_conj()); a = at::_convolution_mode_symint(i_r, w_r, b_r, stride, padding, dilation, groups); b = 
at::_convolution_mode_symint(i_i, w_i, Tensor(), stride, padding, dilation, groups); c = at::_convolution_mode_symint(i_r + i_i, w_r + w_i, b_r + b_i, stride, padding, dilation, groups); @@ -926,9 +922,7 @@ at::Tensor conv1d_symint( bias.dtype().name(), ") should be the same"); - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution(input, weight, bias, stride, padding, dilation, false, {0}, groups); @@ -953,9 +947,7 @@ at::Tensor conv2d_symint( bias.dtype().name(), ") should be the same"); - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution(input, weight, bias, stride, padding, dilation, false, {{0, 0}}, groups); @@ -980,9 +972,7 @@ at::Tensor conv3d_symint( bias.dtype().name(), ") should be the same"); - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution(input, weight, bias, stride, padding, dilation, false, {{0, 0, 0}}, groups); @@ -1080,9 +1070,7 @@ at::Tensor conv1d_padding_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups) { - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); @@ -1096,9 +1084,7 @@ at::Tensor conv2d_padding_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups) { - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); @@ -1112,9 +1098,7 @@ at::Tensor conv3d_padding_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups) { - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); @@ -1131,9 +1115,7 @@ at::Tensor conv_transpose1d_symint( c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; - Tensor input; - 
bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 1, "conv_transpose1d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 1, "conv_transpose1d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution( @@ -1152,9 +1134,7 @@ at::Tensor conv_transpose2d_symint( c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv_transpose2d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 2, "conv_transpose2d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution( @@ -1173,9 +1153,7 @@ at::Tensor conv_transpose3d_symint( c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 3, "conv_transpose3d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 3, "conv_transpose3d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution( @@ -1420,8 +1398,8 @@ static inline std::vector calc_output_size( conv_output_size(input.sizes(), weight.sizes(), params.padding, params.stride, params.dilation); // Handle empty # of channels. - if (input.size(1) == 0) { - output_size[input_channels_dim] = 0; + if (input.size(input_channels_dim) == 0) { + output_size[output_channels_dim] = 0; } return output_size; } diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 7b0a7300459f6..6f8a3477c239c 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -61,7 +61,7 @@ static Tensor compute_columns2d( kernel_height * kernel_width * n_input_plane : output_height * output_width; columns = at::empty({batch_size, row, col}, input.options()); AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "slow_conv2d_cpu", [&]{ - auto input_a = input.accessor(); + auto input_a = input.accessor(); auto columns_a = columns.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { @@ -220,9 +220,9 @@ static inline Tensor view_weight_2d(const Tensor& weight_, template static void slow_conv2d_update_output_frame( - TensorAccessor input, + TensorAccessor input, TensorAccessor output, - TensorAccessor weight, + TensorAccessor weight, bool has_bias, TensorAccessor finput, int64_t kernel_height, @@ -285,8 +285,8 @@ static void slow_conv2d_update_output_frame( template void slow_conv2d_backward_update_grad_input_frame( TensorAccessor grad_input, - TensorAccessor grad_output, - TensorAccessor weight, + TensorAccessor grad_output, + TensorAccessor weight, scalar_t *fgrad_input, int64_t kernel_height, int64_t kernel_width, @@ -405,9 +405,9 @@ void slow_conv2d_backward_out_cpu_template( AT_DISPATCH_FLOATING_TYPES_AND2( kBFloat16, kHalf, input.scalar_type(), "slow_conv2d_cpu_grad_input", [&] { - auto grad_output_a = grad_output.accessor(); + auto grad_output_a = grad_output.accessor(); auto grad_input_a = grad_input.accessor(); - auto weight_a = weight.accessor(); + auto weight_a = weight.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { auto fgrad_input = std::make_unique(fgrad_input_size); @@ -434,8 +434,8 @@ void slow_conv2d_backward_out_cpu_template( 
template void slow_conv2d_backward_weight_frame( TensorAccessor grad_weight, - TensorAccessor grad_output, - TensorAccessor finput, + TensorAccessor grad_output, + TensorAccessor finput, bool is_channels_last) { // Compute grad_weight += grad_output.reshape({grad_output.shape(0), -1}) * finput.T // Note gemm expects fortran order, so all 3 matrices are transposed. @@ -519,9 +519,9 @@ static void slow_conv2d_backward_weight_out_cpu_template( AT_DISPATCH_FLOATING_TYPES_AND2( kBFloat16, kHalf, input.scalar_type(), "slow_conv2d_cpu_grad_weight", [&] { - auto grad_output_a = grad_output.accessor(); + auto grad_output_a = grad_output.accessor(); auto grad_weight_2d_a = grad_weight_2d.accessor(); - auto finput_a = finput.accessor(); + auto finput_a = finput.accessor(); for (const auto t : c10::irange(batch_size)) { auto grad_output_t = grad_output_a[t]; @@ -588,10 +588,10 @@ Tensor& slow_conv2d_forward_out_cpu( TORCH_CHECK(output.is_contiguous(memory_format), "slow_conv2d output tensor must be contiguous"); AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "slow_conv2d_cpu", [&]{ - auto input_a = input.accessor(); + auto input_a = input.accessor(); auto output_a = output.accessor(); auto finput_a = finput.accessor(); - auto weight_2d_a = weight_2d.accessor(); + auto weight_2d_a = weight_2d.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { for (const auto t : c10::irange(start, end)) { diff --git a/aten/src/ATen/native/ConvolutionMM3d.cpp b/aten/src/ATen/native/ConvolutionMM3d.cpp index c194721acd491..1d5e7a8333def 100644 --- a/aten/src/ATen/native/ConvolutionMM3d.cpp +++ b/aten/src/ATen/native/ConvolutionMM3d.cpp @@ -72,7 +72,7 @@ static Tensor compute_columns3d( input.options()); AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "compute_columns3d", [&] { - auto input_a = input.accessor(); + auto input_a = input.accessor(); auto columns_a = columns.accessor(); at::parallel_for(0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) { @@ -261,11 +261,11 @@ static Tensor view_weight_2d(const Tensor& weight_) { template static void slow_conv3d_update_output_frame( - TensorAccessor input, + TensorAccessor input, TensorAccessor output, - TensorAccessor weight, + TensorAccessor weight, bool has_bias, - TensorAccessor finput, + TensorAccessor finput, int64_t kernel_depth, int64_t kernel_height, int64_t kernel_width, @@ -311,8 +311,8 @@ static void slow_conv3d_update_output_frame( template void slow_conv3d_backward_update_grad_input_frame( TensorAccessor grad_input, - TensorAccessor grad_output, - TensorAccessor weight, + TensorAccessor grad_output, + TensorAccessor weight, TensorAccessor fgrad_input, int64_t kernel_depth, int64_t kernel_height, @@ -430,12 +430,12 @@ void slow_conv3d_backward_out_cpu_template( AT_DISPATCH_FLOATING_TYPES_AND2( kBFloat16, kHalf, input.scalar_type(), "slow_conv3d_cpu_grad_input", [&] { + auto grad_input_a = grad_input.accessor(); + auto grad_output_a = grad_output_contiguous.accessor(); + auto fgrad_input_a = fgrad_input.accessor(); + auto weight_2d_a = weight2d.accessor(); at::parallel_for(0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) { - auto grad_input_a = grad_input.accessor(); - auto grad_output_a = grad_output_contiguous.accessor(); - auto fgrad_input_a = fgrad_input.accessor(); - auto weight_2d_a = weight2d.accessor(); for (const auto t : c10::irange(start, end)) { auto grad_input_t = grad_input_a[t]; @@ -464,8 +464,8 @@ void slow_conv3d_backward_out_cpu_template( template 
void slow_conv3d_backward_weight_frame( TensorAccessor grad_weight, - TensorAccessor grad_output, - TensorAccessor finput, + TensorAccessor grad_output, + TensorAccessor finput, int64_t groups) { // Compute grad_weight += grad_output.reshape({grad_output.shape(0), -1}) * finput.T // Note gemm expects fortran order, so all 3 matrices are transposed. @@ -538,8 +538,8 @@ static void slow_conv3d_backward_parameters_out_cpu_template( AT_DISPATCH_FLOATING_TYPES_AND2( kBFloat16, kHalf, input.scalar_type(), "slow_conv3d_cpu_grad_weight", [&] { auto grad_weight_2d_a = grad_weight_2d.accessor(); - auto grad_output_a = grad_output_contiguous.accessor(); - auto finput_a = finput.accessor(); + auto grad_output_a = grad_output_contiguous.accessor(); + auto finput_a = finput.accessor(); for (const auto t : c10::irange(batch_size)) { auto grad_output_t = grad_output_a[t]; auto finput_t = finput_a[t]; @@ -623,10 +623,10 @@ Tensor& slow_conv3d_forward_out_cpu(const Tensor& self, TORCH_CHECK(output.is_contiguous(), "slow_conv3d output must be contiguous"); AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "slow_conv3d_cpu", [&] { - auto input_a = input.accessor(); + auto input_a = input.accessor(); auto output_a = output.accessor(); - auto finput_a = finput.accessor(); - auto weight_2d_a = weight_2d.accessor(); + auto finput_a = finput.accessor(); + auto weight_2d_a = weight_2d.accessor(); at::parallel_for( 0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) { diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index bdd07ea01ff47..eaaa394036866 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -81,7 +81,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.sizes().equals(src.sizes())); _AT_DISPATCH_CP_TYPES(self.scalar_type(), "copy_", [&] { - scalar_t* sp = src.data_ptr(); + const scalar_t* sp = src.const_data_ptr(); scalar_t* rp = self.data_ptr(); scalar_t* bp = buf.data_ptr(); @@ -89,7 +89,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { int64_t NC = src.size(1); for (int64_t R = 0; R < NR; R += BLOCK_SZ) { for (int64_t C = 0; C < NC; C += BLOCK_SZ) { - scalar_t* spo = sp + R + C * NR; + const scalar_t* spo = sp + R + C * NR; scalar_t* rpo = rp + C + R * NC; int nr = std::min(NR - R, BLOCK_SZ); @@ -156,7 +156,7 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) auto* output_ptr = reinterpret_cast(self.data_ptr()); if (self.numel() < at::internal::GRAIN_SIZE) { - fbgemm::FloatToFloat16_simd(src.data_ptr(), output_ptr, self.numel()); + fbgemm::FloatToFloat16_simd(src.const_data_ptr(), output_ptr, self.numel()); } else { at::parallel_for( 0, @@ -164,14 +164,14 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) at::internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) { fbgemm::FloatToFloat16_simd( - src.data_ptr() + begin, + src.const_data_ptr() + begin, output_ptr + begin, end - begin); }); } } else { - auto in_data = reinterpret_cast( - src.data_ptr()); + auto in_data = reinterpret_cast( + src.const_data_ptr()); auto* output_ptr = self.data_ptr(); if (self.numel() < at::internal::GRAIN_SIZE) { fbgemm::Float16ToFloat_simd(in_data, output_ptr, self.numel()); @@ -265,7 +265,7 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) auto iter = TensorIteratorConfig() .add_output(self) - .add_input(src) + .add_const_input(src) .resize_outputs(false) 
.check_all_same_dtype(false) .check_all_same_device(false) @@ -296,7 +296,7 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) } #endif - if(!self.is_complex() && src.is_complex()) { + if(!(self.is_complex() || self.dtype() == at::kBool) && src.is_complex()) { TORCH_WARN_ONCE("Casting complex values to real discards the imaginary part"); } copy_stub(device_type, iter, non_blocking); @@ -335,7 +335,7 @@ void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src) { // FIXME: really, overlapping writes should be illegal/an error in Torch auto iter = TensorIteratorConfig() .add_output(dst) - .add_input(src) + .add_const_input(src) .resize_outputs(false) .set_check_mem_overlap(false) .check_all_same_dtype(true) diff --git a/aten/src/ATen/native/DispatchStub.cpp b/aten/src/ATen/native/DispatchStub.cpp index c7db94889c58d..93c004acdc17c 100644 --- a/aten/src/ATen/native/DispatchStub.cpp +++ b/aten/src/ATen/native/DispatchStub.cpp @@ -10,8 +10,19 @@ #include #include +#ifdef HAVE_ZVECTOR_CPU_DEFINITION +#include +#endif + namespace at::native { +#ifdef HAVE_ZVECTOR_CPU_DEFINITION +static inline bool cpu_has_vxe() +{ + return (getauxval(AT_HWCAP) & HWCAP_S390_VXE); +} +#endif + static CPUCapability compute_cpu_capability() { auto envar = std::getenv("ATEN_CPU_CAPABILITY"); if (envar) { @@ -60,10 +71,16 @@ static CPUCapability compute_cpu_capability() { #endif } #endif + +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + // vxe is needed for fp32 vector instructions + if (cpu_has_vxe()) { + return CPUCapability::ZVECTOR; + } +#endif + #ifdef HAVE_VSX_CPU_DEFINITION return CPUCapability::VSX; -#elif HAVE_ZVECTOR_CPU_DEFINITION - return CPUCapability::ZVECTOR; #else return CPUCapability::DEFAULT; #endif diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index a7df275edf1de..1b3e29a963f18 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -227,18 +227,18 @@ struct RegisterPRIVATEUSE1Dispatch { // adding parentheses and using helper struct to get rid of the parentheses, do // not work with MSVC. So do a `using`-declaration if you need to pass in such // `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in GridSampleKernel.h. 
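For readers following the DECLARE_DISPATCH rework in the next hunk, a minimal sketch of how these stub macros are typically wired together, using the fused_adam_stub pieces added later in this patch; the kernel registration line and kernel function name are illustrative assumptions, not lines from this diff:

// In a header (e.g. FusedAdam.h): declare the stub with its function-pointer type.
DECLARE_DISPATCH(fused_adam_fn, fused_adam_stub);

// In exactly one .cpp (e.g. FusedAdam.cpp): define the stub object.
DEFINE_DISPATCH(fused_adam_stub);

// In a per-ISA kernel translation unit: register an implementation (name assumed here).
REGISTER_DISPATCH(fused_adam_stub, &fused_adam_kernel);

// At call sites: dispatch through the stub with a device type, as FusedAdam.cpp does.
fused_adam_stub(kCPU, params[i], grads[i], exp_avgs[i], /* ... */);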
-#define DECLARE_DISPATCH(fn, name) \ - struct name : DispatchStub { \ - name() = default; \ - name(const name&) = delete; \ - name& operator=(const name&) = delete; \ - }; \ - extern TORCH_API struct name name +#define DECLARE_DISPATCH(fn, name) \ + struct name##_DECLARE_DISPATCH_type : DispatchStub { \ + name##_DECLARE_DISPATCH_type() = default; \ + name##_DECLARE_DISPATCH_type(const name##_DECLARE_DISPATCH_type&) = delete; \ + name##_DECLARE_DISPATCH_type& operator=(const name##_DECLARE_DISPATCH_type&) = delete; \ + }; \ + extern TORCH_API struct name##_DECLARE_DISPATCH_type name; -#define DEFINE_DISPATCH(name) struct name name +#define DEFINE_DISPATCH(name) struct name##_DECLARE_DISPATCH_type name #define REGISTER_ARCH_DISPATCH(name, arch, fn) \ - template <> name::FnPtr TORCH_API DispatchStub::arch = fn; + template <> name##_DECLARE_DISPATCH_type::FnPtr TORCH_API DispatchStub::arch = fn; #ifdef HAVE_AVX512_CPU_DEFINITION #define REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX512, fn) @@ -277,16 +277,16 @@ struct RegisterPRIVATEUSE1Dispatch { REGISTER_ALL_CPU_DISPATCH(name, nullptr) #define REGISTER_CUDA_DISPATCH(name, fn) \ - static RegisterCUDADispatch name ## __register(name, fn); + static RegisterCUDADispatch name ## __register(name, fn); #define REGISTER_HIP_DISPATCH(name, fn) \ - static RegisterHIPDispatch name ## __register(name, fn); + static RegisterHIPDispatch name ## __register(name, fn); #define REGISTER_MPS_DISPATCH(name, fn) \ - static RegisterMPSDispatch name ## __register(name, fn); + static RegisterMPSDispatch name ## __register(name, fn); #define REGISTER_PRIVATEUSE1_DISPATCH(name, fn) \ - static RegisterPRIVATEUSE1Dispatch name ## __register(name, fn); + static RegisterPRIVATEUSE1Dispatch name ## __register(name, fn); // NB: This macro must be used in an actual 'cu' file; if you try using // it from a 'cpp' file it will not work! diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index 4b33b713a5b6c..5af87802a1246 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -310,14 +310,13 @@ Tensor cosine_similarity(const Tensor& x1_, const Tensor& x2_, int64_t dim, doub auto x2_is_int = c10::isIntegralType(x2_.scalar_type(), /*includeBool=*/true); auto x1_t = x1_is_int ? x1_.to(commonDtype) : x1_; auto x2_t = x2_is_int ? x2_.to(commonDtype) : x2_; - c10::MaybeOwned x1, x2; - std::tie(x1, x2) = expand_outplace(x1_t, x2_t); + auto [x1, x2] = expand_outplace(x1_t, x2_t); // We want to divide each tensor by its norm first, as it's more numerically stable.
// This keeps the result between -1.0 and 1.0 // We clone them, as we're going to modify them in-place - // This allows the gradients to propagate propertly all the way to x1 and x2 + // This allows the gradients to propagate properly all the way to x1 and x2 auto x1_norm = at::linalg_vector_norm(*x1, 2, /*dim=*/dim, /*keepdim=*/true).clone(); auto x2_norm = at::linalg_vector_norm(*x2, 2, /*dim=*/dim, /*keepdim=*/true).clone(); diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index c183beab157ff..4d4eb2efaf401 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index 47da7c1bf8cc2..7014ec65d1f5a 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -99,7 +99,7 @@ ALIAS_SPECIALIZATION(_feature_dropout, true, false) ALIAS_SPECIALIZATION(_alpha_dropout, false, true ) ALIAS_SPECIALIZATION(_feature_alpha_dropout, true, true ) -} // anomymous namepsace +} // anonymous namespace std::tuple native_dropout_cpu(const Tensor& input, double p, c10::optional train) { diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 07253c5ed566d..705b08ab39f06 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -124,18 +124,18 @@ Tensor embedding_dense_backward_cpu( auto add_iter = TensorIteratorConfig() .add_output(grad_weight) .add_input(grad_weight) - .add_input(grad) + .add_const_input(grad) .resize_outputs(false) .declare_static_shape(grad.sizes(), /*squash_dims=*/0) .build(); const auto gW_data = reinterpret_cast(grad_weight.data_ptr()); - const auto gO_data = reinterpret_cast(grad.data_ptr()); + const auto gO_data = reinterpret_cast(grad.const_data_ptr()); const auto gW_stride = grad_weight.strides()[0] * grad_weight.element_size(); const auto gO_stride = grad.strides()[0] * grad.element_size(); AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cpu", [&] () { - auto indices_data = indices_contig.data_ptr(); + auto indices_data = indices_contig.const_data_ptr(); // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) std::unique_ptr counts; @@ -164,7 +164,7 @@ Tensor embedding_dense_backward_cpu( // grad_weight[k].add_(grad[i], scale); iter.unsafe_replace_operand(0, gW_data + k * gW_stride); iter.unsafe_replace_operand(1, gW_data + k * gW_stride); - iter.unsafe_replace_operand(2, gO_data + i * gO_stride); + iter.unsafe_replace_operand(2, const_cast(gO_data + i * gO_stride)); add_stub(kCPU, iter, scale); } } @@ -189,7 +189,7 @@ Tensor & embedding_renorm_cpu_( auto num_indices = indices.numel(); AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_renorm_cpu_", [&]() { - auto data_ptr = indices_contig.data_ptr(); + auto data_ptr = indices_contig.const_data_ptr(); auto sorted_indices = std::vector(data_ptr, data_ptr + num_indices); std::sort(sorted_indices.begin(), sorted_indices.end()); diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 0062e6b0804b6..8b6c90dae2375 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -70,16 +70,16 @@ static void make_offset2bag(const Tensor &offsets, Tensor& offset2bag) { namespace { -std::pair promoteIndicesAndOffsets( +std::pair, c10::MaybeOwned> promoteIndicesAndOffsets( const Tensor& 
indices, const Tensor& offsets) { const auto commonType = promoteTypes(offsets.scalar_type(), indices.scalar_type()); return { - indices.scalar_type() == commonType ? indices - : indices.toType(commonType), - offsets.scalar_type() == commonType ? offsets - : offsets.toType(commonType)}; + indices.scalar_type() == commonType ? c10::MaybeOwned::borrowed(indices) + : c10::MaybeOwned::owned(indices.toType(commonType)), + offsets.scalar_type() == commonType ? c10::MaybeOwned::borrowed(offsets) + : c10::MaybeOwned::owned(offsets.toType(commonType))}; } // Determines if we can use a fast implementation for index_select_add, which @@ -125,9 +125,9 @@ index_select_add( index_t padding_idx, _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) { TORCH_CHECK(select_indices.numel() == add_indices.numel()); - auto* add_indices_data = add_indices.data_ptr(); - auto* select_indices_data = select_indices.data_ptr(); - auto* src_data = src.data_ptr(); + auto* add_indices_data = add_indices.const_data_ptr(); + auto* select_indices_data = select_indices.const_data_ptr(); + auto* src_data = src.const_data_ptr(); auto* output_data = output.data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) index_t* bag_size_data = nullptr; @@ -208,14 +208,14 @@ index_select_add( index_t padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { int64_t ddim = src.size(1); - auto* select_indices_data = select_indices.data_ptr(); + auto* select_indices_data = select_indices.const_data_ptr(); auto* output_data = output.data_ptr(); if (is_fast_path_index_select(src, output, padding_idx)) { auto src_contig = src.contiguous(); - auto* src_data = src_contig.data_ptr(); + auto* src_data = src_contig.const_data_ptr(); int64_t output_size = offsets.numel() - 1; - auto* offsets_data = offsets.data_ptr(); + auto* offsets_data = offsets.const_data_ptr(); std::vector offsets_include_last; if (include_last_offset) { @@ -316,8 +316,8 @@ index_select_add( #endif } else { TORCH_CHECK(select_indices.numel() == add_indices.numel()); - auto* src_data = src.data_ptr(); - auto* add_indices_data = add_indices.data_ptr(); + auto* src_data = src.const_data_ptr(); + auto* add_indices_data = add_indices.const_data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) index_t* bag_size_data = nullptr; if (bag_size.defined()) { @@ -388,14 +388,14 @@ index_select_add(const Tensor &select_indices, index_t padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { int64_t ddim = src.size(1); - auto* select_indices_data = select_indices.data_ptr(); + auto* select_indices_data = select_indices.const_data_ptr(); auto* output_data = output.data_ptr(); if (is_fast_path_index_select(src, output, padding_idx)) { auto src_contig = src.contiguous(); - auto* src_data = src_contig.data_ptr(); + auto* src_data = src_contig.const_data_ptr(); int64_t output_size = offsets.numel() - 1; - auto* offsets_data = offsets.data_ptr(); + auto* offsets_data = offsets.const_data_ptr(); std::vector offsets_include_last; if (include_last_offset) { @@ -463,8 +463,8 @@ index_select_add(const Tensor &select_indices, }); } else { AT_ASSERT(select_indices.numel() == add_indices.numel()); - auto* src_data = src.data_ptr(); - auto* add_indices_data = add_indices.data_ptr(); + auto* src_data = src.const_data_ptr(); + auto* add_indices_data = add_indices.const_data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) index_t* bag_size_data = nullptr; if (bag_size.defined()) { @@ -519,9 +519,9 @@ index_select_scale_add( index_t padding_idx, _EmbeddingBagKernelCache* 
/* fbgemm_kernel_cache */) { AT_ASSERT(select_indices.numel() == add_indices.numel()); - auto* add_indices_data = add_indices.data_ptr(); - auto* select_indices_data = select_indices.data_ptr(); - auto* src_data = src.data_ptr(); + auto* add_indices_data = add_indices.const_data_ptr(); + auto* select_indices_data = select_indices.const_data_ptr(); + auto* src_data = src.const_data_ptr(); auto* output_data = output.data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) index_t* bag_size_data = nullptr; @@ -536,7 +536,7 @@ index_select_scale_add( auto output_stride0 = output.strides()[0]; auto output_stride1 = output.strides()[1]; - auto* scale_data = scale.data_ptr(); + auto* scale_data = scale.const_data_ptr(); auto scale_stride = scale.strides()[0]; for (const auto i : c10::irange(numel)) { @@ -579,15 +579,15 @@ index_select_scale_add( index_t padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { int64_t ddim = src.size(1); - auto* scale_data = scale.data_ptr(); - auto* select_indices_data = select_indices.data_ptr(); + auto* scale_data = scale.const_data_ptr(); + auto* select_indices_data = select_indices.const_data_ptr(); auto* output_data = output.data_ptr(); if (is_fast_path_index_select_scale(src, scale, output, padding_idx)) { auto src_contig = src.contiguous(); - auto* src_data = src_contig.data_ptr(); + auto* src_data = src_contig.const_data_ptr(); int64_t output_size = offsets.numel() - 1; - auto* offsets_data = offsets.data_ptr(); + auto* offsets_data = offsets.const_data_ptr(); std::vector offsets_include_last; if (include_last_offset) { @@ -705,8 +705,8 @@ index_select_scale_add( #endif } else { AT_ASSERT(select_indices.numel() == add_indices.numel()); - auto* src_data = src.data_ptr(); - auto* add_indices_data = add_indices.data_ptr(); + auto* src_data = src.const_data_ptr(); + auto* add_indices_data = add_indices.const_data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) index_t* bag_size_data = nullptr; if (bag_size.defined()) { @@ -770,15 +770,15 @@ index_select_scale_add(const Tensor &select_indices, index_t padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { int64_t ddim = src.size(1); - auto* scale_data = scale.data_ptr(); - auto* select_indices_data = select_indices.data_ptr(); + auto* scale_data = scale.const_data_ptr(); + auto* select_indices_data = select_indices.const_data_ptr(); auto* output_data = output.data_ptr(); if (is_fast_path_index_select_scale(src, scale, output, padding_idx)) { auto src_contig = src.contiguous(); - auto* src_data = src_contig.data_ptr(); + auto* src_data = src_contig.const_data_ptr(); int64_t output_size = offsets.numel() - 1; - auto* offsets_data = offsets.data_ptr(); + auto* offsets_data = offsets.const_data_ptr(); std::vector offsets_include_last; if (include_last_offset) { @@ -844,8 +844,8 @@ index_select_scale_add(const Tensor &select_indices, }); } else { AT_ASSERT(select_indices.numel() == add_indices.numel()); - auto* src_data = src.data_ptr(); - auto* add_indices_data = add_indices.data_ptr(); + auto* src_data = src.const_data_ptr(); + auto* add_indices_data = add_indices.const_data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) index_t* bag_size_data = nullptr; if (bag_size.defined()) { @@ -1089,7 +1089,7 @@ void embedding_bag_cpu_max_out( int64_t featureSize = weight.size(1); int64_t vocab_size = weight.size(0); AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_cpu_max_out", [&] { - auto* indices_data = indices.data_ptr(); + auto* indices_data = 
indices.const_data_ptr(); auto* offset2bag_data = offset2bag.data_ptr(); index_t* max_indices_data = nullptr; @@ -1099,7 +1099,7 @@ void embedding_bag_cpu_max_out( max_indices_stride = max_indices->strides()[0]; } - auto* weight_data = weight.data_ptr(); + auto* weight_data = weight.const_data_ptr(); auto* output_data = output.data_ptr(); auto* bag_size_data = bag_size.data_ptr(); auto weight_stride0 = weight.strides()[0]; @@ -1210,8 +1210,9 @@ static std::tuple _embedding_bag_cpu_impl( TORCH_CHECK(weight.dim() == 2, "weight has to be a 2D Tensor, but got Tensor of dimension ", weight.dim()); - Tensor indices, offsets; - std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_); + auto [indicesMaybeOwned, offsetsMaybeOwned] = promoteIndicesAndOffsets(indices_, offsets_); + const auto& indices = *indicesMaybeOwned; + const auto& offsets = *offsetsMaybeOwned; check_arguments(weight, indices, offsets, mode, per_sample_weights, include_last_offset); Tensor output = at::empty( @@ -1331,8 +1332,8 @@ void _embedding_bag_cpu_out( at::Tensor& bag_size, at::Tensor* p_max_indices, const at::Tensor& weight, - const at::Tensor& indices, - const at::Tensor& offsets, + const at::Tensor& indices_, + const at::Tensor& offsets_, const bool /* scale_grad_by_freq */, const int64_t mode, const bool /* sparse */, @@ -1340,6 +1341,9 @@ void _embedding_bag_cpu_out( const bool include_last_offset, const c10::optional& padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { + auto [indicesMaybeOwned, offsetsMaybeOwned] = promoteIndicesAndOffsets(indices_, offsets_); + const auto& indices = *indicesMaybeOwned; + const auto& offsets = *offsetsMaybeOwned; at::native::check_arguments( weight, indices, offsets, mode, per_sample_weights, include_last_offset); @@ -1410,8 +1414,9 @@ Tensor _embedding_bag_backward_symint(const Tensor &grad, const Tensor &indices_ c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); const Tensor& per_sample_weights = *per_sample_weights_maybe_owned; - Tensor indices, offsets; - std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_); + auto [indicesMaybeOwned, offsetsMaybeOwned] = promoteIndicesAndOffsets(indices_, offsets_); + const auto& indices = *indicesMaybeOwned; + const auto& offsets = *offsetsMaybeOwned; auto indices_arg = TensorArg(indices, "indices", 1); checkScalarTypes("embedding_bag", indices_arg, {kLong, kInt}); checkContiguous("embedding_bag", indices_arg); @@ -1473,7 +1478,7 @@ static Tensor _embedding_bag_dense_backward_cpu_max( template static std::vector compute_counts( int64_t num_weights, - index_t* indices_data, + const index_t* indices_data, int64_t indices_length) { std::vector counts(num_weights, 0); for (const auto i : c10::irange(indices_length)) { @@ -1494,7 +1499,7 @@ static std::vector compute_counts( template static std::vector compute_counts_uniq( int64_t num_weights, - index_t* indices_data, + const index_t* indices_data, int64_t indices_length, const std::vector& counts) { std::vector counts_uniq; @@ -1533,11 +1538,11 @@ void _embedding_bag_dense_backward_cpu_sum_mean( optional per_sample_weights; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - scalar_t* per_sample_weights_data; + const scalar_t* per_sample_weights_data; optional per_sample_weights_stride; if (per_sample_weights_.defined()) { per_sample_weights = per_sample_weights_.index_select(0, ind_sort); - per_sample_weights_data = per_sample_weights->data_ptr(); + per_sample_weights_data = 
per_sample_weights->const_data_ptr(); per_sample_weights_stride = per_sample_weights->strides()[0]; } @@ -1549,9 +1554,9 @@ void _embedding_bag_dense_backward_cpu_sum_mean( [&indices, &offset2bag, &bag_size_, &num_weights, &numel, &per_sample_weights, &per_sample_weights_data, &per_sample_weights_stride, &mode, &scale_grad_by_freq, &grad, &index_grad_weight, &padding_idx] { - auto* indices_data = indices.data_ptr(); - auto* offset2bag_data = offset2bag.data_ptr(); - auto* bag_size_data = bag_size_.data_ptr(); + auto* indices_data = indices.const_data_ptr(); + auto* offset2bag_data = offset2bag.const_data_ptr(); + auto* bag_size_data = bag_size_.const_data_ptr(); auto counts = compute_counts(num_weights, indices_data, numel); auto next_unique_index_idx = @@ -1585,7 +1590,7 @@ void _embedding_bag_dense_backward_cpu_sum_mean( } int64_t ddim = grad.size(1); auto igwd = index_grad_weight.data_ptr(); - auto gd = grad.data_ptr(); + auto gd = grad.const_data_ptr(); at::native::cpublas::axpy(ddim, (scalar_t)scale, gd + ddim * source, 1, igwd + ddim * index, 1); } @@ -1666,8 +1671,10 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( AT_ASSERT(grad.dim() == 2); auto embedding_features = grad.sizes()[1]; - Tensor indices, offsets; - std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_); + auto [indicesMaybeOwned, offsetsMaybeOwned] = promoteIndicesAndOffsets(indices_, offsets_); + const auto& indices = *indicesMaybeOwned; + const auto& offsets = *offsetsMaybeOwned; + AT_ASSERT(indices.dim() == 1); auto num_samples = indices.size(0); @@ -1695,11 +1702,11 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( offset2bag_ = offset2bag; } - auto* grad_data = grad.data_ptr(); + auto* grad_data = grad.const_data_ptr(); auto grad_stride0 = grad.strides()[0]; auto grad_stride1 = grad.strides()[1]; - auto* weight_data = weight.data_ptr(); + auto* weight_data = weight.const_data_ptr(); auto weight_stride0 = weight.strides()[0]; auto weight_stride1 = weight.strides()[1]; @@ -1709,11 +1716,11 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( [&indices, &output, &offset2bag_, &num_samples, &embedding_features, &grad_data, &grad_stride0, &grad_stride1, &weight_data, &weight_stride0, &weight_stride1, &padding_idx] () { - auto* indices_data = indices.data_ptr(); + auto* indices_data = indices.const_data_ptr(); // The following are contiguous auto* output_data = output.data_ptr(); - auto* offset2bag_data = offset2bag_.data_ptr(); + auto* offset2bag_data = offset2bag_.const_data_ptr(); // XXX: 64 was arbitrarily chosen. There is probably a sweet spot for this number. 
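A brief sketch of the at::parallel_for contract behind the grain size mentioned in the comment above, assuming the commonly documented behavior; the loop body here is a placeholder, not the patch's code:

// parallel_for(begin, end, grain_size, f) partitions [begin, end) across threads;
// each task receives a contiguous [start, end) chunk, and ranges smaller than
// grain_size are typically run serially on the calling thread, so grain_size
// trades scheduling overhead against load balance.
at::parallel_for(0, num_samples, /*grain_size=*/64, [&](int64_t start, int64_t end) {
  for (const auto sample_idx : c10::irange(start, end)) {
    // per-sample work goes here
  }
});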
parallel_for(0, num_samples, 64, @@ -1726,8 +1733,8 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( if (embedding_idx != static_cast(padding_idx)) { output_data[sample_idx] = dot_impl( embedding_features, - grad_data + grad_stride0 * bag_idx, grad_stride1, - weight_data + weight_stride0 * embedding_idx, weight_stride1); + const_cast(grad_data + grad_stride0 * bag_idx), grad_stride1, + const_cast(weight_data + weight_stride0 * embedding_idx), weight_stride1); } } }); diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index 07ad16d3110b0..4e8963da05521 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -216,7 +216,7 @@ inline std::vector convert_tensor_to_scalar_list( scalarList_.scalar_type(), "convert_tensor_to_scalar_list", [&]() { - const scalar_t* scalar_data = scalarList_.data_ptr(); + const scalar_t* scalar_data = scalarList_.const_data_ptr(); TORCH_CHECK( (expect_length == scalarList_.size(0)), "Expected length of scalars to match input of length ", @@ -248,7 +248,7 @@ inline bool can_use_fast_route( } using DeviceDtypeKey = std::pair; -using IndicesT = std::vector; +using IndicesT = std::vector; using nested_optional_tensorvec_t = std::vector>>; using TensorsAndIndicesT = std::pair; diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp index c9797b2e70a53..d1a5808d0c66c 100644 --- a/aten/src/ATen/native/FractionalMaxPool2d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool2d.cpp @@ -131,10 +131,10 @@ namespace { template static void fractional_max_pool2d_out_single_batch_frame( - scalar_t* input, + const scalar_t* input, scalar_t* output, int64_t* indices, - scalar_t* randomSamples, + const scalar_t* randomSamples, int numPlanes, int inputW, int inputH, int outputW, int outputH, @@ -142,7 +142,7 @@ static void fractional_max_pool2d_out_single_batch_frame( at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { for (const auto plane : c10::irange(start, end)) { /* each plane contains 2 random samples, one for W and one for H */ - scalar_t* randomSamplesForPlane = randomSamples + plane * 2; + const scalar_t* randomSamplesForPlane = randomSamples + plane * 2; /* Generate interval sequence */ auto sequenceW = generate_intervals( @@ -154,7 +154,7 @@ static void fractional_max_pool2d_out_single_batch_frame( // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int h, w; - scalar_t* inputForPlane = input + plane * inputW * inputH; + const scalar_t* inputForPlane = input + plane * inputW * inputH; scalar_t* outputForPlane = output + plane * outputW * outputH; int64_t* indicesForPlane = indices + plane * outputW * outputH; @@ -192,10 +192,10 @@ static void fractional_max_pool2d_out_single_batch_frame( template static void fractional_max_pool2d_out_frame( - scalar_t* input, + const scalar_t* input, scalar_t* output, int64_t* indices, - scalar_t* randomSamples, + const scalar_t* randomSamples, int numBatch, int numPlanes, int inputW, int inputH, int outputW, int outputH, @@ -225,16 +225,16 @@ static void fractional_max_pool2d_out_frame( template static void fractional_max_pool2d_backward_out_single_batch_frame( scalar_t* gradInput, - scalar_t* gradOutput, - int64_t* indices, + const scalar_t* gradOutput, + const int64_t* indices, int numPlanes, int inputW, int inputH, int outputW, int outputH) { at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { for (const auto plane : c10::irange(start, end)) { scalar_t* gradInputForPlane 
= gradInput + plane * inputW * inputH; - scalar_t* gradOutputForPlane = gradOutput + plane * outputW * outputH; - int64_t* indicesForPlane = indices + plane * outputW * outputH; + const scalar_t* gradOutputForPlane = gradOutput + plane * outputW * outputH; + const int64_t* indicesForPlane = indices + plane * outputW * outputH; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int h, w; @@ -254,8 +254,8 @@ static void fractional_max_pool2d_backward_out_single_batch_frame( template static void fractional_max_pool2d_backward_out_frame( scalar_t* gradInput, - scalar_t* gradOutput, - int64_t* indices, + const scalar_t* gradOutput, + const int64_t* indices, int numBatch, int numPlanes, int inputW, int inputH, int outputW, int outputH) { @@ -326,10 +326,10 @@ TORCH_IMPL_FUNC(fractional_max_pool2d_out_cpu) ( kHalf, input.scalar_type(), "fractional_max_pool2d_out_frame", [&] { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); - auto randomSamples_data = randomSamples.data_ptr(); + auto randomSamples_data = randomSamples.const_data_ptr(); fractional_max_pool2d_out_frame( input_data, output_data, @@ -383,8 +383,8 @@ TORCH_IMPL_FUNC(fractional_max_pool2d_backward_cpu) ( kHalf, input.scalar_type(), "fractional_max_pool2d_backward_out_frame", [&] { auto gradInput_data = gradInput.data_ptr(); - auto gradOutput_data = gradOutput.data_ptr(); - auto indices_data = indices.data_ptr(); + auto gradOutput_data = gradOutput.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); fractional_max_pool2d_backward_out_frame( gradInput_data, gradOutput_data, diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index 9763cebd3ffbd..79da29a6f4462 100644 --- a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -101,10 +101,10 @@ namespace { template static void fractional_max_pool3d_out_single_batch_frame( - scalar_t* input, + const scalar_t* input, scalar_t* output, int64_t* indices, - scalar_t* randomSamples, + const scalar_t* randomSamples, int64_t numPlanes, int64_t inputT, int64_t inputH, int64_t inputW, int64_t outputT, int64_t outputH, int64_t outputW, @@ -114,7 +114,7 @@ static void fractional_max_pool3d_out_single_batch_frame( for (const auto plane : c10::irange(start, end)) { /* each plane contains 3 random samples, one for T, one for W, and one for H */ - scalar_t* randomSamplesForPlane = randomSamples + plane * 3; + const scalar_t* randomSamplesForPlane = randomSamples + plane * 3; /* Generate interval sequence */ auto sequenceT = generate_intervals( @@ -128,7 +128,7 @@ static void fractional_max_pool3d_out_single_batch_frame( // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t t, h, w; - scalar_t* inputForPlane = input + plane * inputT * inputH * inputW; + const scalar_t* inputForPlane = input + plane * inputT * inputH * inputW; scalar_t* outputForPlane = output + plane * outputT * outputH * outputW; int64_t* indicesForPlane = indices + plane * outputT * outputH * outputW; @@ -173,10 +173,10 @@ static void fractional_max_pool3d_out_single_batch_frame( template static void fractional_max_pool3d_out_frame( - scalar_t* input, + const scalar_t* input, scalar_t* output, int64_t* indices, - scalar_t* randomSamples, + const scalar_t* randomSamples, int64_t numBatch, int64_t numPlanes, int64_t inputT, int64_t inputH, int64_t inputW, int64_t outputT, int64_t outputH, int64_t outputW, @@ 
-244,10 +244,10 @@ TORCH_IMPL_FUNC(fractional_max_pool3d_out_cpu)( "fractional_max_pool3d_out_frame", [&] { fractional_max_pool3d_out_frame( - input.data_ptr(), + input.const_data_ptr(), output.data_ptr(), indices.data_ptr(), - randomSamples.data_ptr(), + randomSamples.const_data_ptr(), numBatch, numPlanes, inputT, inputH, inputW, outputT, outputH, outputW, @@ -262,8 +262,8 @@ namespace { template static void fractional_max_pool3d_backward_out_single_batch_frame( scalar_t* gradInput, - scalar_t* gradOutput, - int64_t* indices, + const scalar_t* gradOutput, + const int64_t* indices, int64_t numPlanes, int64_t inputT, int64_t inputH, int64_t inputW, int64_t outputT, int64_t outputH, int64_t outputW) { @@ -271,9 +271,9 @@ static void fractional_max_pool3d_backward_out_single_batch_frame( at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { for (const auto plane : c10::irange(start, end)) { scalar_t* gradInputForPlane = gradInput + plane * inputT * inputH * inputW; - scalar_t* gradOutputForPlane = gradOutput + + const scalar_t* gradOutputForPlane = gradOutput + plane * outputT * outputH * outputW; - int64_t* indicesForPlane = indices + plane * outputT * outputH * outputW; + const int64_t* indicesForPlane = indices + plane * outputT * outputH * outputW; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t h, w, t; @@ -294,8 +294,8 @@ static void fractional_max_pool3d_backward_out_single_batch_frame( template static void fractional_max_pool3d_backward_out_frame( scalar_t* gradInput, - scalar_t* gradOutput, - int64_t* indices, + const scalar_t* gradOutput, + const int64_t* indices, int64_t numBatch, int64_t numPlanes, int64_t inputT, int64_t inputH, int64_t inputW, int64_t outputT, int64_t outputH, int64_t outputW) { @@ -381,8 +381,8 @@ void fractional_max_pool3d_backward_out_cpu_template( [&]{ fractional_max_pool3d_backward_out_frame( gradInput.data_ptr(), - gradOutput.data_ptr(), - indices.data_ptr(), + gradOutput.const_data_ptr(), + indices.const_data_ptr(), numBatch, numPlanes, inputT, inputH, inputW, outputT, outputH, outputW diff --git a/aten/src/ATen/native/FusedAdam.cpp b/aten/src/ATen/native/FusedAdam.cpp new file mode 100644 index 0000000000000..b3be769b24f18 --- /dev/null +++ b/aten/src/ATen/native/FusedAdam.cpp @@ -0,0 +1,175 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif +namespace at { + +namespace native { + +void _fused_adam_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + const float* grad_scale_ptr = + grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; + const float* found_inf_ptr = + found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; + if (found_inf_ptr && *found_inf_ptr == 1.0) { + return; + } + size_t n_tensors = params.size(); + TORCH_CHECK(grads.size() == n_tensors); + TORCH_CHECK(exp_avgs.size() == n_tensors); + TORCH_CHECK(exp_avg_sqs.size() == n_tensors); + if (amsgrad) { + TORCH_CHECK(max_exp_avg_sqs.size() == n_tensors); + } else { + TORCH_CHECK(max_exp_avg_sqs.size() == 0); + } + TORCH_CHECK(state_steps.size() == n_tensors); + at::Tensor max_exp_avg_sq = at::Tensor(); + for (size_t i = 0; i < n_tensors; i++){ + if (amsgrad) max_exp_avg_sq = max_exp_avg_sqs[i]; + fused_adam_stub( + kCPU, + params[i], + grads[i], + exp_avgs[i], + exp_avg_sqs[i], + max_exp_avg_sq, + state_steps[i], + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + grad_scale_ptr, + ADAM_MODE::ORIGINAL); + } +} + +// The following overload simply has a Tensor lr +void _fused_adam_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + _fused_adam_kernel_cpu_(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr.item(), beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf); +} + +void _fused_adamw_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + const float* grad_scale_ptr = + grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; + const float* found_inf_ptr = + found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; + if (found_inf_ptr && *found_inf_ptr == 1.0) { + return; + } + size_t n_tensors = params.size(); + TORCH_CHECK(grads.size() == n_tensors); + TORCH_CHECK(exp_avgs.size() == n_tensors); + TORCH_CHECK(exp_avg_sqs.size() == n_tensors); + if (amsgrad) { + TORCH_CHECK(max_exp_avg_sqs.size() == n_tensors); + } else { + TORCH_CHECK(max_exp_avg_sqs.size() == 0); + } + TORCH_CHECK(state_steps.size() == n_tensors); + at::Tensor max_exp_avg_sq = at::Tensor(); + for (size_t i = 0; i < n_tensors; i++){ + if (amsgrad) max_exp_avg_sq = max_exp_avg_sqs[i]; + fused_adam_stub( + kCPU, + params[i], + grads[i], + exp_avgs[i], + exp_avg_sqs[i], + max_exp_avg_sq, + state_steps[i], + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + grad_scale_ptr, + ADAM_MODE::ADAMW); + } +} + +// The following overload simply has a Tensor lr +void _fused_adamw_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + _fused_adamw_kernel_cpu_(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr.item(), beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf); +} + + +DEFINE_DISPATCH(fused_adam_stub); + +} +} diff --git a/aten/src/ATen/native/FusedAdam.h b/aten/src/ATen/native/FusedAdam.h new file mode 100644 index 0000000000000..6fbbaf2441297 --- /dev/null +++ b/aten/src/ATen/native/FusedAdam.h @@ -0,0 +1,30 @@ +#include +#include + +namespace at { + +namespace native { + +enum class ADAM_MODE : uint8_t { ORIGINAL = 0, ADAMW = 1 }; + +using fused_adam_fn = void (*)( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& exp_avg, + const at::Tensor& exp_avg_sq, + const at::Tensor& max_exp_avg_sq, + const at::Tensor& state_step, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const float* grad_scale_ptr, + const ADAM_MODE); + +DECLARE_DISPATCH(fused_adam_fn, fused_adam_stub); + +} +} diff --git a/aten/src/ATen/native/FusedSGD.cpp b/aten/src/ATen/native/FusedSGD.cpp new file mode 100644 index 0000000000000..56e2e91759113 --- /dev/null +++ b/aten/src/ATen/native/FusedSGD.cpp @@ -0,0 +1,86 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif +namespace at { + +namespace native { + + +void _fused_sgd_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList momentum_buffer_list, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + const float* grad_scale_ptr = + grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; + const float* found_inf_ptr = + found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; + if (found_inf_ptr && *found_inf_ptr == 1.0) { + return; + } + size_t n_tensors = params.size(); + TORCH_CHECK(grads.size() == n_tensors); + bool no_momentum_buffer = momentum == 0.0; + if (no_momentum_buffer) { + TORCH_CHECK(momentum_buffer_list.size() == 0); + } else { + TORCH_CHECK(momentum_buffer_list.size() == n_tensors); + } + for (size_t i = 0; i < n_tensors; i++){ + fused_sgd_stub( + kCPU, + params[i], + grads[i], + no_momentum_buffer ? Tensor() : momentum_buffer_list[i], + weight_decay, + momentum, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr); + } +} + +void _fused_sgd_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList momentum_buffer_list, + const double weight_decay, + const double momentum, + const at::Tensor& lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + _fused_sgd_kernel_cpu_( + params, grads, momentum_buffer_list, weight_decay, + momentum, lr.item(), dampening, nesterov, + maximize, is_first_step, grad_scale, found_inf + ); +} + +DEFINE_DISPATCH(fused_sgd_stub); + +} +} diff --git a/aten/src/ATen/native/FusedSGD.h b/aten/src/ATen/native/FusedSGD.h new file mode 100644 index 0000000000000..62cd3c8aef73b --- /dev/null +++ b/aten/src/ATen/native/FusedSGD.h @@ -0,0 +1,24 @@ +#include +#include + +namespace at { + +namespace native { + +using fused_sgd_fn = void (*)( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& momentum_buffer, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr); + +DECLARE_DISPATCH(fused_sgd_fn, fused_sgd_stub); + +} +} diff --git a/aten/src/ATen/native/GatedLinearUnit.cpp b/aten/src/ATen/native/GatedLinearUnit.cpp index 73028d12f9c8b..3a4aaab632ced 100644 --- a/aten/src/ATen/native/GatedLinearUnit.cpp +++ b/aten/src/ATen/native/GatedLinearUnit.cpp @@ -71,9 +71,9 @@ Tensor& glu_backward_cpu_out(const Tensor& grad_output, const Tensor& input, // for second gradinput half, can get a better performance by fusion auto iter = at::TensorIteratorConfig() .add_output(gradInputsecondHalf) - .add_input(gradInputfirstHalf) - .add_input(firstHalf) - .add_input(grad_output) + .add_const_input(gradInputfirstHalf) + .add_const_input(firstHalf) + .add_const_input(grad_output) .build(); glu_backward_stub(iter.device_type(), iter); gradInputfirstHalf.mul_(grad_output); @@ -99,10 +99,10 @@ Tensor glu_jvp( auto dglu = at::empty_like(glu); auto iter = at::TensorIteratorConfig() .add_output(dglu) - .add_input(glu) - .add_input(b) - .add_input(da) - .add_input(db) + .add_const_input(glu) + .add_const_input(b) + .add_const_input(da) + .add_const_input(db) .build(); glu_jvp_stub(iter.device_type(), iter); return dglu; diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index 329f22bd99777..5d0259eeb1ba2 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -75,19 +75,19 @@ namespace { int64_t out_sD = output.stride(2); int64_t out_sH = output.stride(3); int64_t out_sW = output.stride(4); - scalar_t *inp_ptr = input.data_ptr(); + const scalar_t *inp_ptr = input.const_data_ptr(); scalar_t *out_ptr = output.data_ptr(); - scalar_t *grid_ptr = grid.data_ptr(); + const scalar_t *grid_ptr = grid.const_data_ptr(); // 
loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { for (const auto n : c10::irange(start, end)) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + const scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + const scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; for (const auto d : c10::irange(out_D)) { for (const auto h : c10::irange(out_H)) { for (const auto w : c10::irange(out_W)) { // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; + const scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; scalar_t ix = *grid_ptr_NDHW; scalar_t iy = grid_ptr_NDHW[grid_sCoor]; scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; @@ -144,7 +144,7 @@ namespace { // calculate bilinear weighted pixel value and set output pixel scalar_t *out_ptr_NCDHW = out_ptr + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; for (int64_t c = 0; c < C; ++c, out_ptr_NCDHW += out_sC, inp_ptr_NC += inp_sC) { // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse @@ -181,9 +181,9 @@ namespace { int64_t iy_nearest = static_cast(std::nearbyint(iy)); int64_t iz_nearest = static_cast(std::nearbyint(iz)); - // assign nearest neighor pixel value to output pixel + // assign nearest neighbour pixel value to output pixel scalar_t *out_ptr_NCDHW = out_ptr + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; for (int64_t c = 0; c < C; ++c, out_ptr_NCDHW += out_sC, inp_ptr_NC += inp_sC) { if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H, inp_W)) { *out_ptr_NCDHW = inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + ix_nearest * inp_sW]; @@ -268,9 +268,9 @@ namespace { } int64_t gGrid_sN = grad_grid.stride(0); int64_t gGrid_sW = grad_grid.stride(3); - scalar_t *inp_ptr = input.data_ptr(); - scalar_t *grid_ptr = grid.data_ptr(); - scalar_t *gOut_ptr = grad_output.data_ptr(); + const scalar_t *inp_ptr = input.const_data_ptr(); + const scalar_t *grid_ptr = grid.const_data_ptr(); + const scalar_t *gOut_ptr = grad_output.const_data_ptr(); scalar_t *gInp_ptr = nullptr; if (input_requires_grad) { gInp_ptr = grad_input.mutable_data_ptr(); @@ -279,14 +279,14 @@ namespace { // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { for (const auto n : c10::irange(start, end)) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + const scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + const scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN; for (const auto d : c10::irange(out_D)) { for (const auto h : c10::irange(out_H)) { for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) { // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; + const scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; scalar_t ix = *grid_ptr_NDHW; scalar_t iy = grid_ptr_NDHW[grid_sCoor]; scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; @@ -344,8 +344,8 @@ namespace { scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); scalar_t gix = 
static_cast(0), giy = static_cast(0), giz = static_cast(0); - scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + const scalar_t *inp_ptr_NC = inp_ptr_N; scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; // calculate bilinear weighted pixel value and set output pixel for (int64_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { @@ -422,8 +422,8 @@ namespace { int64_t iy_nearest = static_cast(std::nearbyint(iy)); int64_t iz_nearest = static_cast(std::nearbyint(iz)); - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + // assign nearest neighbour pixel value to output pixel + const scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; if (input_requires_grad) { scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; for (int64_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC) { @@ -589,18 +589,18 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, int64_t out_sC = output.stride(1); int64_t out_sH = output.stride(2); int64_t out_sW = output.stride(3); - scalar_t *inp_ptr = input.data_ptr(); + const scalar_t *inp_ptr = input.const_data_ptr(); scalar_t *out_ptr = output.data_ptr(); - scalar_t *grid_ptr = grid.data_ptr(); + const scalar_t *grid_ptr = grid.const_data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { for (const auto n : c10::irange(start, end)) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + const scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + const scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; for (const auto h : c10::irange(out_H)) { for (const auto w : c10::irange(out_W)) { // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; + const scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; scalar_t x = *grid_ptr_NHW; scalar_t y = grid_ptr_NHW[grid_sCoor]; @@ -630,7 +630,7 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, scalar_t se = (ix - ix_nw) * (iy - iy_nw); // calculate bilinear weighted pixel value and set output pixel - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { auto res = static_cast(0); @@ -652,9 +652,9 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, int64_t ix_nearest = static_cast(std::nearbyint(ix)); int64_t iy_nearest = static_cast(std::nearbyint(iy)); - // assign nearest neighor pixel value to output pixel + // assign nearest neighbour pixel value to output pixel scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) { *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; @@ -676,13 +676,13 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, const scalar_t tx = ix 
- ix_nw; const scalar_t ty = iy - iy_nw; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) scalar_t coefficients[4]; - // Interpolate 4 values in the x directon + // Interpolate 4 values in the x direction for (const auto i : c10::irange(4)) { coefficients[i] = cubic_interp1d( get_value_bounded(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners), @@ -758,21 +758,21 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, int64_t gInp_sW = grad_input.stride(3); int64_t gGrid_sN = grad_grid.stride(0); int64_t gGrid_sW = grad_grid.stride(2); - scalar_t *inp_ptr = input.data_ptr(); - scalar_t *grid_ptr = grid.data_ptr(); - scalar_t *gOut_ptr = grad_output.data_ptr(); + const scalar_t *inp_ptr = input.const_data_ptr(); + const scalar_t *grid_ptr = grid.const_data_ptr(); + const scalar_t *gOut_ptr = grad_output.const_data_ptr(); scalar_t *gInp_ptr = grad_input.mutable_data_ptr(); scalar_t *gGrid_ptr = grad_grid.data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { for (const auto n : c10::irange(start, end)) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + const scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + const scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; for (const auto h : c10::irange(out_H)) { for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { // get the corresponding input x, y co-ordinates from grid - scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; + const scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; scalar_t x = *grid_ptr_NHW; scalar_t y = grid_ptr_NHW[grid_sCoor]; @@ -804,9 +804,9 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, scalar_t se = (ix - ix_nw) * (iy - iy_nw); scalar_t gix = static_cast(0), giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; + const scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; // calculate bilinear weighted pixel value and set output pixel for (int64_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { scalar_t gOut = *gOut_ptr_NCHW; @@ -847,8 +847,8 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, int64_t ix_nearest = static_cast(std::nearbyint(ix)); int64_t iy_nearest = static_cast(std::nearbyint(iy)); - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; + // assign nearest neighbour pixel value to output pixel + const scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; for (int64_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC) { // calculate and set grad_input @@ -883,9 +883,9 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, scalar_t gix = static_cast(0); scalar_t giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN 
+ h * gOut_sH + w * gOut_sW; + const scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; for (int64_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC+= inp_sC) { scalar_t gOut = *gOut_ptr_NCHW; diff --git a/aten/src/ATen/native/GridSampler.h b/aten/src/ATen/native/GridSampler.h index aaeb7331c3e88..509a305fe4b5e 100644 --- a/aten/src/ATen/native/GridSampler.h +++ b/aten/src/ATen/native/GridSampler.h @@ -211,7 +211,7 @@ static inline bool within_bounds_3d(int64_t d, int64_t h, int64_t w, int64_t D, template static inline scalar_t get_value_bounded( - scalar_t* data, + const scalar_t* data, scalar_t x, scalar_t y, int64_t W, diff --git a/aten/src/ATen/native/LegacyBatching.cpp b/aten/src/ATen/native/LegacyBatching.cpp index cb461932b9a64..8aa08a875f7d9 100644 --- a/aten/src/ATen/native/LegacyBatching.cpp +++ b/aten/src/ATen/native/LegacyBatching.cpp @@ -115,10 +115,7 @@ Tensor _remove_batch_dim(const Tensor& self, int64_t level, int64_t batch_size, const auto* batched = maybeGetBatchedImpl(self); TORCH_INTERNAL_ASSERT(batched != nullptr); - Tensor self_without_bdim; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t newly_exposed_logical_dim; - std::tie(self_without_bdim, newly_exposed_logical_dim) = remove_existing_batch_dim(batched, level); + auto [self_without_bdim, newly_exposed_logical_dim] = remove_existing_batch_dim(batched, level); return maybe_movedim(self_without_bdim, newly_exposed_logical_dim, out_dim); } diff --git a/aten/src/ATen/native/Lerp.cpp b/aten/src/ATen/native/Lerp.cpp index b7e04f0841311..cded246ec7bad 100644 --- a/aten/src/ATen/native/Lerp.cpp +++ b/aten/src/ATen/native/Lerp.cpp @@ -20,9 +20,9 @@ TORCH_META_FUNC(lerp_Tensor)( " for `weight` but got dtype ", weight.dtype()); build(at::TensorIteratorConfig() .add_output(maybe_get_output()) - .add_input(self) - .add_input(end) - .add_input(weight)); + .add_const_input(self) + .add_const_input(end) + .add_const_input(weight)); } TORCH_META_FUNC(lerp_Scalar)( diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 8dfcff2342069..9322776b03f5a 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -30,6 +30,7 @@ #endif #include +#include #include #include #include @@ -70,6 +71,14 @@ static inline Tensor _flatten_nd_linear(const Tensor& input, const Tensor& weigh Tensor linear(const Tensor& input, const Tensor& weight, const c10::optional& bias_opt) { + // _matmul_impl checks this again later, but _flatten_nd_linear does not work on scalars inputs, + // so let's try to catch this here already + const auto input_dim = input.dim(); + const auto weight_dim = weight.dim(); + TORCH_CHECK(input_dim != 0 && weight_dim != 0, + "both arguments to linear need to be at least 1D, but they are ", + input_dim, "D and ", weight_dim, "D"); + // See [Note: hacky wrapper removal for optional tensor] auto bias = bias_opt.has_value() ? c10::MaybeOwned::borrowed(*bias_opt) @@ -82,7 +91,6 @@ Tensor linear(const Tensor& input, const Tensor& weight, const c10::optionaldefined()) { // Fused op is marginally faster. 
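// Illustrative sketch, not part of the patch above (assumes only the public
// ATen C++ API): the new TORCH_CHECK added to at::linear rejects 0-D
// arguments up front, before _flatten_nd_linear or matmul are ever reached.
#include <ATen/ATen.h>
#include <iostream>

void linear_scalar_guard_demo() {
  at::Tensor input = at::randn({4, 3});
  at::Tensor weight = at::randn({});  // 0-D tensor
  try {
    at::linear(input, weight);
  } catch (const std::exception& e) {
    // Message: "both arguments to linear need to be at least 1D, but they are 2D and 0D"
    std::cout << e.what() << std::endl;
  }
}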
return at::addmm(*bias, input, weight.t()); @@ -703,6 +711,28 @@ Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; + if (bias.defined()) { + TORCH_CHECK( + input1.dtype() == input2.dtype() && input1.dtype() == weight.dtype() && + input1.dtype() == bias.dtype(), + "All tensors must have the same dtype, got input1: ", + input1.dtype(), + ", input2: ", + input2.dtype(), + ", weight: ", + weight.dtype(), + ", bias: ", + bias.dtype()); + } else { + TORCH_CHECK( + input1.dtype() == input2.dtype() && input1.dtype() == weight.dtype(), + "All tensors must have the same dtype, got input1: ", + input1.dtype(), + ", input2: ", + input2.dtype(), + ", weight: ", + weight.dtype()); + } TORCH_CHECK(input1.dim() == input2.dim(), "bilinear(): input dimensions do not match: got ", input1.dim(), " and ", input2.dim()); for (const auto i : c10::irange(input1.dim() - 1)) { @@ -785,7 +815,7 @@ Tensor tensordot(const Tensor& input1, const Tensor& input2, IntArrayRef dims1, rsizes.emplace_back(t2.sym_size(i)); } } - // permut and reshape for matrix multiplication + // permute and reshape for matrix multiplication t1 = t1.permute(p1).reshape_symint({size1, csize}); t2 = t2.permute(p2).reshape_symint({csize, size2}); // multiply and reshape to target size diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 397b546fbb0f6..81f461f6c95b8 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -12,12 +12,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -28,12 +30,17 @@ #else #include #include +#include +#include #include #include #include #include #include #include +#include +#include +#include #include #include #include @@ -114,6 +121,7 @@ #include #include #include +#include #include #include #include @@ -421,8 +429,7 @@ std::tuple slogdet_out(const Tensor& A, Tensor& sign, Tensor& Tensor logdet(const Tensor& A) { squareCheckInputs(A, "logdet"); checkFloatingOrComplex(A, "logdet", /*low_precision*/false); - Tensor sign, logabsdet; - std::tie(sign, logabsdet) = at::linalg_slogdet(A); + auto [sign, logabsdet] = at::linalg_slogdet(A); if (A.is_complex()) { return sign.log() + logabsdet; @@ -443,7 +450,12 @@ std::tuple get_atol_rtol( const optional& atol_opt, const optional& rtol_opt, const c10::string_view function_name) { - auto options = input.options().dtype(ScalarType::Double); + auto options = input.options(); + if (input.device().type() == kMetal || input.device().type() == kMPS) { + options = options.dtype(ScalarType::Float); + } else { + options = options.dtype(ScalarType::Double); + } auto atol = atol_opt.has_value() ? atol_opt.value() : at::zeros({}, options); checkNotComplexTolerance(atol, function_name, "atol"); Tensor rtol; @@ -464,7 +476,7 @@ std::tuple get_atol_rtol( const Tensor& input, optional atol_opt, optional rtol_opt) { - double atol = atol_opt.has_value() ? atol_opt.value() : 0.0; + auto atol = atol_opt.has_value() ? atol_opt.value() : 0.0; c10::SymFloat rtol; if (rtol_opt.has_value()) { rtol = rtol_opt.value(); @@ -475,7 +487,12 @@ std::tuple get_atol_rtol( ? 
0.0 : default_rtol; } - auto options = input.options().dtype(ScalarType::Double); + auto options = input.options(); + if (input.device().type() == kMetal || input.device().type() == kMPS) { + options = options.dtype(ScalarType::Float); + } else { + options = options.dtype(ScalarType::Double); + } auto atol_tensor = at::full({}, atol, options); auto rtol_tensor = at::full({}, rtol, options); return std::make_tuple(atol_tensor, rtol_tensor); @@ -498,32 +515,28 @@ Tensor linalg_pinv( "linalg.pinv(", t, "{", input.sizes(), "}): expected a tensor with 2 or more dimensions " "of float, double, cfloat or cdouble types"); - Tensor atol, rtol; - std::tie(atol, rtol) = get_atol_rtol(input, atol_opt, rtol_opt, "torch.linalg.pinv"); + auto [atol, rtol] = get_atol_rtol(input, atol_opt, rtol_opt, "torch.linalg.pinv"); if (input.sym_numel() == 0) { // The implementation below uses operations that do not work for zero numel tensors // therefore we need this early return for 'input.numel() == 0' case - Tensor U, S, V; // TODO: replace input.svd with linalg_svd when torch/xla can work with at::linalg_svd - std::tie(U, S, V) = input.svd(); + auto [U, S, V] = input.svd(); return at::matmul(V * S.reciprocal().unsqueeze(-2), U.mH()); } // If not Hermitian use singular value decomposition, else use eigenvalue decomposition if (!hermitian) { - Tensor U, S, V; // TODO: replace input.svd with linalg_svd // using linalg_svd breaks pytorch/xla, see https://github.com/pytorch/xla/issues/2755 - std::tie(U, S, V) = input.svd(); + auto [U, S, V] = input.svd(); Tensor max_val = at::narrow(S, /*dim=*/-1, /*start=*/0, /*length=*/1); // singular values are sorted in descending order Tensor tol = at::max(atol.unsqueeze(-1), rtol.unsqueeze(-1) * max_val); Tensor S_pseudoinv = at::where(S > tol, S.reciprocal(), at::zeros({}, S.options())).to(input.dtype()); // computes V @ diag(S_pseudoinv) @ U.conj().T return at::matmul(V * S_pseudoinv.unsqueeze(-2), U.mH()); } else { - Tensor S, U; - std::tie(S, U) = at::linalg_eigh(input); + auto [S, U] = at::linalg_eigh(input); // For Hermitian matrices, singular values equal to abs(eigenvalues) Tensor S_abs = S.abs(); // eigenvalues are sorted in ascending order starting with negative values, we need a maximum value of abs(eigenvalues) @@ -536,15 +549,19 @@ Tensor linalg_pinv( } Tensor linalg_pinv(const Tensor& input, optional atol, optional rtol, bool hermitian) { - Tensor atol_tensor, rtol_tensor; - std::tie(atol_tensor, rtol_tensor) = get_atol_rtol(input, atol, rtol); + auto [atol_tensor, rtol_tensor] = get_atol_rtol(input, atol, rtol); return at::linalg_pinv(input, atol_tensor, rtol_tensor, hermitian); } Tensor linalg_pinv(const Tensor& input, const Tensor& rcond, bool hermitian) { // For NumPy compatibility the rcond argument is used as relative tolerance checkNotComplexTolerance(rcond, "torch.linalg.pinv", "rcond"); - auto options = input.options().dtype(ScalarType::Double); + auto options = input.options(); + if (input.device().type() == kMetal || input.device().type() == kMPS) { + options = options.dtype(ScalarType::Float); + } else { + options = options.dtype(ScalarType::Double); + } return at::linalg_pinv(input, at::zeros({}, options), rcond, hermitian); } @@ -713,8 +730,7 @@ Tensor& matrix_rank_impl( const optional& rtol_opt, bool hermitian, Tensor& result) { - Tensor atol, rtol; - std::tie(atol, rtol) = get_atol_rtol(input, atol_opt, rtol_opt, "torch.linalg.matrix_rank"); + auto [atol, rtol] = get_atol_rtol(input, atol_opt, rtol_opt, "torch.linalg.matrix_rank"); 
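// Illustrative sketch, not part of the patch above (assumes only the public
// ATen C++ API): the non-Hermitian branch of linalg_pinv above computes
// pinv(A) = V @ diag(1/S) @ U^H, dropping singular values at or below the
// atol/rtol threshold instead of inverting them. A reference version for a
// real-valued A:
#include <ATen/ATen.h>

at::Tensor pinv_via_svd_reference(const at::Tensor& A, double atol) {
  auto [U, S, Vh] = at::linalg_svd(A, /*full_matrices=*/false);
  // Zero out reciprocals of singular values that fall below the tolerance.
  auto S_pinv = at::where(S > atol, S.reciprocal(), at::zeros({}, S.options()));
  return at::matmul(Vh.mH() * S_pinv.unsqueeze(-2), U.mH());
}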
checkSameDevice("torch.linalg.matrix_rank", result, input); checkSameDevice("torch.linalg.matrix_rank", atol, input, "atol"); @@ -788,8 +804,7 @@ Tensor& linalg_matrix_rank_out( } Tensor& linalg_matrix_rank_out(const Tensor& input, optional atol, optional rtol, bool hermitian, Tensor& result) { - Tensor atol_tensor, rtol_tensor; - std::tie(atol_tensor, rtol_tensor) = get_atol_rtol(input, atol, rtol); + auto [atol_tensor, rtol_tensor] = get_atol_rtol(input, atol, rtol); result = linalg_matrix_rank_out(input, atol_tensor, rtol_tensor, hermitian, result); return result; } @@ -802,8 +817,7 @@ Tensor linalg_matrix_rank(const Tensor& input, const optional& atol, con Tensor linalg_matrix_rank(const Tensor& input, optional atol, optional rtol, bool hermitian) { auto result = get_matrix_rank_result_tensor(input); - Tensor atol_tensor, rtol_tensor; - std::tie(atol_tensor, rtol_tensor) = get_atol_rtol(input, atol, rtol); + auto [atol_tensor, rtol_tensor] = get_atol_rtol(input, atol, rtol); return matrix_rank_impl(input, atol_tensor, rtol_tensor, hermitian, result); } @@ -831,8 +845,7 @@ Tensor linalg_matrix_rank(const Tensor& input, const Tensor& tol, bool hermitian Tensor linalg_matrix_rank(const Tensor& input, double tol, bool hermitian) { auto result = get_matrix_rank_result_tensor(input); - Tensor atol_tensor, rtol_tensor; - std::tie(atol_tensor, rtol_tensor) = get_atol_rtol(input, tol, 0.0); + auto [atol_tensor, rtol_tensor] = get_atol_rtol(input, tol, 0.0); return matrix_rank_impl(input, atol_tensor, rtol_tensor, hermitian, result); } @@ -1016,7 +1029,7 @@ Tensor multi_dot_impl(TensorList _tensors, c10::optional _out) { // If the last and last tensors have shapes (a, b) and (b, c) the // output has shape (a, c). If either the first or last tensor is 1D - // a and/or c dimensions will be implicitely size 1 and will be ommited + // a and/or c dimensions will be implicitly size 1 and will be omitted // from the output. e.g. for inputs (a, b) x (b) the output has shape (a,). at::native::resize_output(out, out_shape); @@ -1166,9 +1179,9 @@ static TensorIterator build_addr_iter(Tensor& result, auto iter = TensorIteratorConfig() .set_check_mem_overlap(true) .add_output(result) - .add_owned_input(*self_) - .add_owned_input(vec1.reshape({vec1_size0, 1})) - .add_input(vec2) + .add_owned_const_input(*self_) + .add_owned_const_input(vec1.reshape({vec1_size0, 1})) + .add_const_input(vec2) .allow_cpu_scalars(true) .promote_inputs_to_common_dtype(true) .cast_common_dtype_to_outputs(true) @@ -1323,15 +1336,22 @@ Tensor outer(const Tensor& self, const Tensor& vec2) { #if !defined(C10_MOBILE) -#define _AT_DISPATCH_ADDMM_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \ - kBFloat16, kHalf, kFloat8_e5m2, kFloat8_e4m3fn, \ +#define _AT_DISPATCH_ADDMM_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND6( \ + kBFloat16, kHalf, kFloat8_e5m2, kFloat8_e4m3fn, kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, \ + TYPE, NAME, __VA_ARGS__) +#else +// Include half dtype in ADDMM. Used to build ExecuTorch in xplat. +#if defined(C10_MOBILE_HALF) +#define _AT_DISPATCH_ADDMM_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, \ TYPE, NAME, __VA_ARGS__) #else #define _AT_DISPATCH_ADDMM_TYPES(TYPE, NAME, ...) 
\ AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, \ TYPE, NAME, __VA_ARGS__) #endif +#endif static inline int64_t get_mkldnn_matmul_min_dim() { @@ -1635,8 +1655,8 @@ inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const T opmath_t beta = beta_.to(); auto r0 = result.accessor(); - auto s0 = self.accessor(); - auto m0 = mat2.accessor(); + auto s0 = self.accessor(); + auto m0 = mat2.accessor(); int64_t grain_size = std::max(internal::GRAIN_SIZE / (is * js * ks), (int64_t)1); using opmath_t = at::opmath_type; @@ -1705,8 +1725,8 @@ static void baddbmm_with_gemm_(const Tensor &result, const Tensor &mat1, const T transpose_a ? TransposeType::Transpose : TransposeType::NoTranspose, transpose_b ? TransposeType::Transpose : TransposeType::NoTranspose, batch_size, m, n, k, alpha, - mat2.data_ptr(), lda, mat2_strides[0], - mat1.data_ptr(), ldb, mat1_strides[0], + mat2.const_data_ptr(), lda, mat2_strides[0], + mat1.const_data_ptr(), ldb, mat1_strides[0], beta, result.data_ptr(), ldc, result_strides[0]); }); @@ -1756,7 +1776,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens }; bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]); - if (apply_heur && use_mkldnn_lower_precision_matmul(batch1, batch2, self_or_result)) { + if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) { try { mkldnn_matmul(batch1, batch2, self_or_result, beta.to(), alpha.to()); return; @@ -1796,7 +1816,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens * vs. other threads, leading to undefined behavior. * Thus it is recommended to not use at::parallel_for where lambdas do * ops that go through dispatcher. - * For now we circument this by InferenceMode guard in order to unlock + * For now we circumvent this by InferenceMode guard in order to unlock * performance. * Longer term we probably want a separate API that explicitly calls out * the TLS that it propagates. @@ -1822,6 +1842,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens r, r, batch1.select(0, b), batch2.select(0, b), 0, 1); } }; + // Materialize if COW, since we cannot do so during parallel_for + self_or_result.mutable_data_ptr(); at::parallel_for(0, bs, 1, bmm_out_fn); } else { for (const auto b : c10::irange(bs)) { @@ -1838,6 +1860,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens batch1.select(0, b), batch2.select(0, b), beta, alpha); } }; + // Materialize if COW, since we cannot do so during parallel_for + self_or_result.mutable_data_ptr(); at::parallel_for(0, bs, 1, bmm_fn); } else { for (const auto b : c10::irange(bs)) { @@ -1907,7 +1931,7 @@ Tensor& vdot_out(const Tensor& self, const Tensor& other, Tensor& result) { return result.fill_(self.vdot(other)); } -static bool should_fold(const Tensor& tensor1, const Tensor& tensor2) { +static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_out) { // We check that we can fold the larger tensor into a matrix and dispatch to mm or mv rather than // to bmm. We want to make sure we can do so without incurring in any extra copy const auto tensor1_larger = tensor1.dim() >= tensor2.dim(); @@ -1933,10 +1957,13 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2) { // The output gradient g of this operation would have shape [b, m, k] // The backward wrt. 
t2 of bmm would be given by t1.mH @ g, which has shape [b, n, k] // Then, the backward of expand is simply `sum(0)`. As such, we are instantiating a tensor - // of shape [b, n, k] unnacessarily, which may cause a large memory footprint, and in the + // of shape [b, n, k] unnecessarily, which may cause a large memory footprint, and in the // worst case, an OOM bool t2_requires_grad = tensor1_larger ? tensor2.requires_grad() : tensor1.requires_grad(); - if (t2_requires_grad) { + if (t2_requires_grad && !has_out) { + // We should be checking !at::GradMode::is_enabled(), but apparently + // this regresses performance in some cases: + // https://github.com/pytorch/pytorch/issues/118548#issuecomment-1916022394 return true; } @@ -1995,6 +2022,15 @@ static Tensor _matmul_impl( const bool has_out = out.defined(); + if (has_out) { + // Usually we would rely on the out= kernels we decompose into to check this, but + // for matmul there is logic at the composite level that relies on this invariant. + TORCH_CHECK(!(tensor1.requires_grad() || tensor2.requires_grad() || out.requires_grad()) || !at::GradMode::is_enabled(), + "matmul(): functions with out=... arguments don't support automatic differentiation, " + "but one of the arguments requires grad." + ); + } + if (dim_tensor1 == 1 && dim_tensor2 == 1) { return has_out ? at::dot_out(out, tensor1, tensor2) : tensor1.dot(tensor2); } else if (dim_tensor1 == 2 && dim_tensor2 == 1) { @@ -2004,7 +2040,7 @@ static Tensor _matmul_impl( : tensor1.unsqueeze(0).mm(tensor2).squeeze_(0); } else if (dim_tensor1 == 2 && dim_tensor2 == 2) { return has_out ? at::mm_out(out, tensor1, tensor2) : tensor1.mm(tensor2); - } else if (should_fold(tensor1, tensor2)) { + } else if (should_fold(tensor1, tensor2, has_out)) { // dim_tensor1 >=3 && (dim_tensor2 == 1 || dim_tensor2 == 2) || // dim_tensor2 >=3 && (dim_tensor1 == 1 || dim_tensor1 == 2) // and at least one of the following two conditions hold @@ -2553,10 +2589,9 @@ Tensor compute_T18_scale_square( // gives us an opportunity to calculate the matrix multiplication in a batch. // The first thing we need to do is sort tensor `s`, which will be helpful to // do the matrix multiplication by range. - Tensor sorted_s, sorted_s_inds; // With above example, `sorted_s` is [0, 1, 1, 4], we also will need the index // info, so we can use it to compose the result back. - std::tie(sorted_s, sorted_s_inds) = at::sort(s, /*dim=*/0); + auto [sorted_s, sorted_s_inds] = at::sort(s, /*dim=*/0); sorted_s = sorted_s.to(at::kLong); // Then we call `unique_consecutive` and we will use it to split `sorted_s`, // with above example, `split_counts` is [1, 2, 1]. @@ -2575,10 +2610,10 @@ Tensor compute_T18_scale_square( TORCH_INTERNAL_ASSERT(section_values.is_contiguous()); const auto section_numel = section_values.numel() / 2; - auto scs = section_values.data_ptr(); + auto scs = section_values. template data_ptr(); auto pts = &scs[section_numel]; - // We now will do the matrix muplication in a batch, with above example: + // We now will do the matrix multiplication in a batch, with above example: // 1. Multiply all matrices by 0 (`mul_times[0]`) times, then do `slice` // to get the remain matrices by acc[1:] (`split_counts[0]`), // 2. Multiply remain matrices by 1 times and slice to acc[2:] @@ -2737,7 +2772,7 @@ Tensor backward_analytic_function_of_a_matrix( } // end anon namespace // Computes the matrix exponential for a given batch of squared matrices. 
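// Illustrative sketch, not part of the patch above (assumes only the public
// ATen C++ API): compute_T18_scale_square relies on the scaling-and-squaring
// identity exp(A) = (exp(A / 2^s))^(2^s); sorting the per-matrix s values is
// what lets it share one batched matmul per squaring round. A plain,
// unbatched reference of the same identity:
#include <ATen/ATen.h>
#include <cmath>

at::Tensor matrix_exp_scale_square_reference(const at::Tensor& A, int64_t s) {
  // Exponentiate the scaled-down matrix, then square the result s times.
  at::Tensor X = at::linalg_matrix_exp(A / std::pow(2.0, static_cast<double>(s)));
  for (int64_t i = 0; i < s; ++i) {
    X = at::matmul(X, X);
  }
  return X;  // agrees with at::linalg_matrix_exp(A) up to floating-point error
}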
-// The implementaion is based on: +// The implementation is based on: // // Bader, P.; Blanes, S.; Casas, F. // Computing the Matrix Exponential with an Optimized Taylor Polynomial Approximation. @@ -2782,13 +2817,49 @@ TORCH_IMPL_FUNC(linalg_vector_norm_out)(const Tensor& self, const Scalar& scalar // values larger than 10^53 (same for negative numbers), so that's fine. auto ord = scalar_ord.toDouble(); auto dim = opt_dim.value_or(IntArrayRef{}); + auto size = self.sizes(); + auto ndim = self.dim(); + + auto opt_dim_ = dim.vec(); + maybe_wrap_dims(opt_dim_, ndim); + + using Int = IntArrayRef::value_type; + std::vector all_dim(ndim); + std::iota(all_dim.begin(), all_dim.end(), 0); + + bool is_all_reduce = !opt_dim.has_value() || opt_dim.value().empty(); + auto reduce_dim = is_all_reduce ? all_dim : opt_dim_; + + bool is_reduce_over_1D_vector = true; + for (auto i : reduce_dim) { + if (size[i] != 1){ + is_reduce_over_1D_vector = false; + break; + } + } + + if (is_reduce_over_1D_vector) { + Tensor self_; + if (opt_dtype.has_value()) { + self_ = self.to(*opt_dtype); + } else { + self_ = self; + } + if (ord != 0.0) { + keepdim ? at::abs_outf(self_, const_cast(result)) : at::abs_outf(self_.squeeze(reduce_dim), const_cast(result)); + } else { + keepdim ? at::ne_outf(self_, 0, const_cast(result)) : at::ne_outf(self_.squeeze(reduce_dim), 0, const_cast(result)); + } + return; + } + // No need to handle opt_dtype explicitly as it is already encoded in the dtype of result // https://github.com/pytorch/pytorch/issues/52648 // Reductions always use `std::abs` to compute the absolute value. In the backward of this // function, we need to locate the index that was selected as the largest value. To do so // we do self.abs() == result to locate the index of the largest element. - // Now, self.abs() may dispatch to a vectorized implementation which gives sliiightly different + // Now, self.abs() may dispatch to a vectorized implementation which gives slightly different // results to the std::abs(std::complex) implementation. // As such, to be able to compute the correct index in the backward, we need to use self.abs() // both in the forward and in the backward @@ -3360,5 +3431,178 @@ Tensor kron(const Tensor& self, const Tensor& other) { return KronImpl(self, other).kron(); } +// Weight Only Quantization Gemm +DEFINE_DISPATCH(weight_to_int4pack_stub); +DEFINE_DISPATCH(int4pack_mm_stub); +DEFINE_DISPATCH(int8pack_mm_stub); + +Tensor _convert_weight_to_int4pack_cpu( + const Tensor& in, + int64_t innerKTiles) { + + TORCH_CHECK(in.dim() == 2, + __func__, " : expect weight to be 2D tensor."); + TORCH_CHECK(in.dtype() == at::kInt, + __func__, " : expect weight to be kInt."); + TORCH_CHECK(innerKTiles == 2 || innerKTiles == 4 || innerKTiles == 8, + __func__, " : innerKTiles need to be 2, 4, or 8, got ", innerKTiles); + + auto weight = in.contiguous(); + auto N = weight.size(0); + auto K = weight.size(1); + + // Create fake shapes for cpu. The meta registration in dynamo requires + // operator has the same output shape for each device. 
So creating a fake + shape {N / 8, K / (16 * innerKTiles), 32, innerKTiles / 2} + constexpr int64_t kNTileSize = 8; + constexpr int64_t kKTileSize = 16; + auto nTiles = (N + kNTileSize - 1) / kNTileSize; + + TORCH_CHECK(N % 16 == 0, + __func__, " : expect N to be divisible by 16"); + const int64_t kSuperKTileSize = kKTileSize * innerKTiles; + TORCH_CHECK( K % kSuperKTileSize == 0, + __func__, " : expect K to be divisible by ", kSuperKTileSize); + auto kSuperTiles = (K + kSuperKTileSize - 1) / kSuperKTileSize; + + auto weight_packed = at::empty( + {nTiles, kSuperTiles, 32, innerKTiles / 2}, + at::TensorOptions().dtype(at::kInt)); + + weight_to_int4pack_stub(kCPU, weight_packed, weight, N, K); + return weight_packed; +} + +Tensor _weight_int4pack_mm_cpu( + const Tensor& A, + const Tensor& B, + int64_t qGroupSize, + const Tensor& qScaleAndZeros) { + + constexpr int64_t kNTileSize = 8; + + auto M = A.size(0); + auto N = B.size(0) * kNTileSize; + auto K = A.size(1); + + TORCH_CHECK(A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat, + __func__, " : expect A to be either 32-bit or 16-bit float tensor."); + TORCH_CHECK(A.is_contiguous(), + __func__, " : expect A to be contiguous."); + TORCH_CHECK(A.dim() == 2, + __func__, " : expect A to be 2D tensor."); + + TORCH_CHECK(B.dtype() == kInt, + __func__, " : expect B to be int32 tensor."); + TORCH_CHECK(B.is_contiguous(), + __func__, " : expect B to be contiguous."); + TORCH_CHECK(B.dim() == 4, + __func__, " : expect B to be a 4d tensor."); + + TORCH_CHECK(qGroupSize == 32 || qGroupSize == 64 || qGroupSize == 128 + || qGroupSize == 256, + __func__, ": expect qGroupSize to be 32, 64, 128 or 256, got ", qGroupSize); + + TORCH_CHECK(qScaleAndZeros.dim() == 3 && qScaleAndZeros.size(1) == N + && qScaleAndZeros.size(2) == 2, + __func__, ": expect qScaleAndZeros to be 3d tensor with sizes [:, ", N, ", 2]"); + + auto C = at::empty({M, N}, A.options()); + int4pack_mm_stub(kCPU, C, A, B, qGroupSize, qScaleAndZeros, N, K); + + return C; +} + +Tensor _weight_int8pack_mm_cpu( + const Tensor& A, + const Tensor& B, + const Tensor& scales) { + + auto M = A.size(0); + auto N = B.size(0); + auto K = A.size(1); + + TORCH_CHECK(A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat, + __func__, " : expect A to be either 32-bit or 16-bit float tensor."); + TORCH_CHECK(A.is_contiguous(), + __func__, " : expect A to be contiguous."); + TORCH_CHECK(A.dim() == 2, + __func__, " : expect A to be 2D tensor."); + + TORCH_CHECK(B.dtype() == kChar, + __func__, " : expect B to be int8 tensor."); + TORCH_CHECK(B.is_contiguous(), + __func__, " : expect B to be contiguous."); + TORCH_CHECK(B.size(1) == K, + __func__, " : expect B.size(1) == ", K); + + TORCH_CHECK(scales.dim() == 1 && scales.size(0) == N, + __func__, " : expect scales to be 1d tensor with size ", N); + + auto C = at::empty({M, N}, A.options()); + int8pack_mm_stub(kCPU, C, A, B, scales); + + return C; +} + +Tensor& _int_mm_out_cpu(const Tensor& self, const Tensor& mat2, Tensor& result) { + static constexpr c10::string_view func_name = "int_mm_out_cpu"; + TORCH_CHECK(self.dim() == 2, func_name, ": Expected self to be of dimension 2 but got ", self.dim()); + TORCH_CHECK(mat2.dim() == 2, func_name, ": Expected mat2 to be of dimension 2 but got ", mat2.dim()); + TORCH_CHECK(self.size(1) == mat2.size(0), func_name, ": self.size(1) needs to match mat2.size(0) but got ", self.size(1), " and ", mat2.size(0)); + TORCH_CHECK(self.dtype() == at::kChar, func_name, ": Expected self dtype to be of type 
int8 but got ", self.dtype()); + TORCH_CHECK(mat2.dtype() == at::kChar, func_name, ": Expected mat2 dtype to be of type int8 but got ", mat2.dtype()); + TORCH_CHECK(result.dtype() == at::kInt, func_name, ": Expected result dtype to be of type kInt but got ", result.dtype()); + TORCH_CHECK(result.size(0) == self.size(0), func_name, ": Expected result.size(0) to be ", self.size(0), " but got ", result.size(0)); + TORCH_CHECK(result.size(1) == mat2.size(1), func_name, ": Expected result.size(1) to be ", mat2.size(1), " but got ", result.size(1)); + TORCH_CHECK(result.dim() == 2, func_name, ": Expected result to be of dimension 2 but got ", result.dim()); + TORCH_CHECK(result.is_contiguous(), func_name, ": Expected result to be contiguous."); + + if (result.numel() == 0 || self.size(1) == 0) { + return result.zero_(); + } + + bool dispatched = false; + if (at::globalContext().userEnabledMkldnn()) { + try { + mkldnn_matmul_i8i8i32(self, mat2, result); + dispatched = true; + } catch (const std::exception& e) { + TORCH_WARN(func_name, " failed, switching to BLAS gemm: ", e.what()); + } + } + if (!dispatched) { + auto a = reinterpret_cast(self.data_ptr()); + auto b = reinterpret_cast(mat2.data_ptr()); + auto c = reinterpret_cast(result.data_ptr()); + const int64_t m = result.size(0); + const int64_t n = result.size(1); + const int64_t k = self.size(1); + const int64_t lda_0 = self.strides()[0]; + const int64_t lda_1 = self.strides()[1]; + const int64_t ldb_0 = mat2.strides()[0]; + const int64_t ldb_1 = mat2.strides()[1]; + const int64_t ldc = result.strides()[0]; + parallel_for(0, m * n, 1, [&](int64_t start, int64_t end) { + for (const auto i : c10::irange(start, end)) { + auto row = i / n; + auto col = i % n; + c[row * ldc + col] = 0; + for (const auto k : c10::irange(k)) { + c[row * ldc + col] = c[row * ldc + col] + + static_cast(a[row * lda_0 + k * lda_1]) * + static_cast(b[k * ldb_0 + col * ldb_1]); + } + } + }); + } + return result; +} + +Tensor _int_mm_cpu(const Tensor& self, const Tensor& mat2) { + Tensor result = at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt)); + return _int_mm_out_cpu(self, mat2, result); +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 141caa5236825..0b05d5162e668 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -331,8 +331,7 @@ static inline std::tuple _linalg_broadcast_batch_dims(const Tenso linearSolveCheckInputs(arg1, arg2, name); } - std::vector arg1_expand_size, arg2_expand_size; - std::tie(arg1_expand_size, arg2_expand_size) = at::native::_linalg_broadcast_batch_dims(arg1, arg2); + auto [arg1_expand_size, arg2_expand_size] = at::native::_linalg_broadcast_batch_dims(arg1, arg2); auto arg1_broadcasted = arg1_expand_size == arg1.sizes() ? arg1 : arg1.expand(arg1_expand_size); auto arg2_broadcasted = arg2_expand_size == arg2.sizes() ? 
arg2 : arg2.expand(arg2_expand_size); diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 0eafdf27648d2..e21d9f6008e8e 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -268,31 +269,34 @@ Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target, auto iter = TensorIteratorConfig() .add_output(loss_squeezed) - .add_owned_input(at::squeeze(input)) - .add_owned_input(at::squeeze(target)) + .add_owned_const_input(at::squeeze(input)) + .add_owned_const_input(at::squeeze(target)) .build(); - AT_DISPATCH_FLOATING_TYPES(loss.scalar_type(), "binary_cross_entropy", [&] { - at::native::cpu_kernel( - iter, - [] (scalar_t input_val, scalar_t target_val) { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + loss.scalar_type(), + "binary_cross_entropy", + [&] { + at::native::cpu_kernel( + iter, [](scalar_t input_val, scalar_t target_val) { TORCH_CHECK( (input_val >= 0) && (input_val <= 1), - "all elements of input should be between 0 and 1" - ); + "all elements of input should be between 0 and 1"); TORCH_CHECK( (target_val >= 0) && (target_val <= 1), - "all elements of target should be between 0 and 1" - ); + "all elements of target should be between 0 and 1"); // Binary cross entropy tensor is defined by the equation: // L = -w (y ln(x) + (1-y) ln(1-x)) - return (target_val - scalar_t(1)) - * std::max(scalar_t(std::log1p(-input_val)), scalar_t(-100)) - - target_val * std::max(scalar_t(std::log(input_val)), scalar_t(-100)); - } - ); - }); + return (target_val - scalar_t(1)) * + std::max(scalar_t(std::log1p(-input_val)), scalar_t(-100)) - + target_val * + std::max(scalar_t(std::log(input_val)), scalar_t(-100)); + }); + }); + if (weight.defined()) { loss.mul_(weight); } @@ -322,26 +326,30 @@ Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor& auto iter = TensorIteratorConfig() .add_output(grad_input_squeezed) - .add_owned_input(at::squeeze(grad)) - .add_owned_input(at::squeeze(input)) - .add_owned_input(at::squeeze(target)) + .add_owned_const_input(at::squeeze(grad)) + .add_owned_const_input(at::squeeze(input)) + .add_owned_const_input(at::squeeze(target)) .build(); - AT_DISPATCH_FLOATING_TYPES(grad_input.scalar_type(), "binary_cross_entropy_backward", [&] { - at::native::cpu_kernel( - iter, - [] (scalar_t grad_val, scalar_t input_val, scalar_t target_val) { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + grad_input.scalar_type(), + "binary_cross_entropy_backward", + [&] { + at::native::cpu_kernel( + iter, + [](scalar_t grad_val, scalar_t input_val, scalar_t target_val) { // The gradient is the partial derivative of BCELoss // with respect to x // d(L)/d(x) = -w (y - x) / (x - x^2) - return grad_val * (input_val - target_val) - / (scalar_t(std::max( + return grad_val * (input_val - target_val) / + (scalar_t(std::max( (scalar_t(1) - input_val) * input_val, - scalar_t(EPSILON) - ))); - } - ); - }); + scalar_t(EPSILON)))); + }); + }); + if (weight.defined()) { grad_input.mul_(weight); } @@ -358,21 +366,20 @@ Tensor binary_cross_entropy_with_logits(const Tensor& input, const Tensor& targe c10::MaybeOwned pos_weight_maybe_owned = at::borrow_from_optional_tensor(pos_weight_opt); const Tensor& pos_weight = *pos_weight_maybe_owned; - Tensor loss; - auto max_val = (-input).clamp_min_(0); - if (pos_weight.defined()) { - // pos_weight need to be broadcasted, thus 
mul(target) is not inplace. - auto log_weight = (pos_weight - 1).mul(target).add_(1); - loss = (1 - target).mul_(input).add_(log_weight.mul_(((-max_val).exp_().add_((-input - max_val).exp_())).log_().add_(max_val))); - } else { - loss = (1 - target).mul_(input).add_(max_val).add_((-max_val).exp_().add_((-input -max_val).exp_()).log_()); - } + auto log_sigmoid_input = at::log_sigmoid(input); + if (pos_weight.defined()) { + // pos_weight need to be broadcasted, thus mul(target) is not inplace. + auto log_weight = (pos_weight - 1).mul(target).add_(1); + log_sigmoid_input.mul_(log_weight); + } - if (weight.defined()) { - loss.mul_(weight); - } + Tensor loss = (1 - target).mul_(input).sub_(log_sigmoid_input); - return apply_loss_reduction(loss, reduction); + if (weight.defined()) { + loss.mul_(weight); + } + + return apply_loss_reduction(loss, reduction); } Tensor poisson_nll_loss(const Tensor& input, const Tensor& target, const bool log_input, const bool full, const double eps, const int64_t reduction) @@ -435,9 +442,9 @@ Tensor& smooth_l1_loss_backward_out(const Tensor& grad_output, const Tensor& inp auto norm = reduction == Reduction::Mean ? 1. / input.numel() : 1.; auto iter = at::TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(target) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(target) + .add_const_input(grad_output) .promote_inputs_to_common_dtype(true) .cast_common_dtype_to_outputs(true) .enforce_safe_casting_to_output(true) @@ -480,9 +487,9 @@ Tensor& huber_loss_backward_out(const Tensor& grad_output, const Tensor& input, auto norm = (reduction == Reduction::Mean) ? (1. / input.numel()) : 1.; auto iter = at::TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(target) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(target) + .add_const_input(grad_output) .build(); huber_backward_stub(iter.device_type(), iter, norm, delta); return grad_input; @@ -498,9 +505,9 @@ Tensor& mse_loss_backward_out(const Tensor& grad_output, auto norm = reduction == Reduction::Mean ? 2. 
/ input.numel() : 2.; auto iter = at::TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(target) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(target) + .add_const_input(grad_output) .build(); mse_backward_stub(iter.device_type(), iter, norm); return grad_input; diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index b6ad40b344b23..b13ed7e2ce921 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -77,6 +77,9 @@ std::tuple> ctc_loss_allocate_outpu if (targets.dim() == 1) { // concatenated targets int64_t pos = 0; for (const auto i : c10::irange(batch_size)) { + TORCH_CHECK(target_lengths[i] >= 0, + "Expected target_lengths to have value at least ", 0, ", but got value ", target_lengths[i], + " (while checking arguments for ", c, ")"); tg_batch_offsets[i] = pos; pos += target_lengths[i]; if (max_target_length < target_lengths[i]) @@ -89,6 +92,9 @@ std::tuple> ctc_loss_allocate_outpu // dim is 2 int64_t tg_batch_stride = targets.stride(0); for (const auto i : c10::irange(batch_size)) { + TORCH_CHECK(target_lengths[i] >= 0, + "Expected target_lengths to have value at least ", 0, ", but got value ", target_lengths[i], + " (while checking arguments for ", c, ")"); tg_batch_offsets[i] = i * tg_batch_stride; if (max_target_length < target_lengths[i]) max_target_length = target_lengths[i]; @@ -101,6 +107,9 @@ std::tuple> ctc_loss_allocate_outpu } int64_t max_input_length = log_probs.size(0); for (const auto b : c10::irange(batch_size)) { + TORCH_CHECK(input_lengths[b] >= 0, + "Expected input_lengths to have value at least ", 0, ", but got value ", input_lengths[b], + " (while checking arguments for ", c, ")"); TORCH_CHECK(input_lengths[b] <= max_input_length, "Expected input_lengths to have value at most ", max_input_length, ", but got value ", input_lengths[b], " (while checking arguments for ", c, ")"); @@ -139,9 +148,9 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const int64_t batch_size = log_probs.size(1); auto lpp = log_probs.permute({1,0,2}); - auto log_probs_a_global = lpp.accessor(); + auto log_probs_a_global = lpp.accessor(); auto log_alpha_a_global = log_alpha.accessor(); - auto targets_data = targets.data_ptr(); + auto targets_data = targets.const_data_ptr(); auto neg_log_likelihood_a = neg_log_likelihood.accessor(); // alpha calculation for the first row, the three equations for alpha_1 above eq (6) @@ -155,6 +164,12 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const auto log_alpha_a = log_alpha_a_global[b]; int64_t tg_batch_offset = tg_batch_offsets[b]; + if (input_length == 0) { + scalar_t log_likelihood = target_length == 0 ? 
0 : neginf; + neg_log_likelihood_a[b] = -log_likelihood; + continue; + } + // the first two items of alpha_t above eq (6) log_alpha_a[0][0] = log_probs_a[0][BLANK]; if (target_length > 0) @@ -254,12 +269,13 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_ Tensor log_beta = at::empty_like(log_alpha, LEGACY_CONTIGUOUS_MEMORY_FORMAT); // could be optimized to use only 2 rows auto lpp = log_probs.permute({1,0,2}); - auto log_probs_a_global = lpp.accessor(); - auto log_alpha_a_global = log_alpha.accessor(); + auto log_probs_a_global = lpp.accessor(); + auto log_alpha_a_global = log_alpha.accessor(); auto log_beta_a_global = log_beta.accessor(); auto gp = grad.permute({1,0,2}); auto grad_a_global = gp.accessor(); - auto targets_data = targets.data_ptr(); + auto targets_data = targets.const_data_ptr(); + auto grad_out_a = grad_out.accessor(); auto create_fill_iterator = [](const Tensor& tensor, IntArrayRef squash_dims) { return TensorIteratorConfig() @@ -366,7 +382,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_ // now we wrap up the calculation by adding in the remaining items of eq (16) // this could be a great target for further vectorization. // grad is the output gradient, nll is the loss. Note that the likelihood -nll is the Z of eq (16) - scalar_t gr = grad_out.accessor()[b]; + scalar_t gr = grad_out_a[b]; for (const auto t : c10::irange(input_length)) { // or go for the full thing? for (const auto c : c10::irange(num_labels)) { scalar_t& res = grad_a[t][c]; @@ -422,8 +438,8 @@ std::tuple ctc_loss_tensor(const Tensor& log_probs, const Tensor Tensor ilc = input_lengths.to(Device(at::kCPU), at::kLong).contiguous(); Tensor tlc = target_lengths.to(Device(at::kCPU), at::kLong).contiguous(); - IntArrayRef il(ilc.data_ptr(), ilc.numel()); - IntArrayRef tl(tlc.data_ptr(), tlc.numel()); + IntArrayRef il(ilc.const_data_ptr(), ilc.numel()); + IntArrayRef tl(tlc.const_data_ptr(), tlc.numel()); return at::_ctc_loss(log_probs, targets, il, tl, BLANK, zero_infinity); } @@ -536,8 +552,8 @@ Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, const Tensor& in Tensor ilc = input_lengths.to(Device(at::kCPU), at::kLong).contiguous(); Tensor tlc = target_lengths.to(Device(at::kCPU), at::kLong).contiguous(); - IntArrayRef il(ilc.data_ptr(), ilc.numel()); - IntArrayRef tl(tlc.data_ptr(), tlc.numel()); + IntArrayRef il(ilc.const_data_ptr(), ilc.numel()); + IntArrayRef tl(tlc.const_data_ptr(), tlc.numel()); return at::native::ctc_loss(log_probs, targets, il, tl, BLANK, reduction, zero_infinity); } diff --git a/aten/src/ATen/native/LossMultiLabelMargin.cpp b/aten/src/ATen/native/LossMultiLabelMargin.cpp index f87c0755f5e12..58ca609eaed54 100644 --- a/aten/src/ATen/native/LossMultiLabelMargin.cpp +++ b/aten/src/ATen/native/LossMultiLabelMargin.cpp @@ -24,8 +24,8 @@ namespace { template inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu( - scalar_t* input_data, - int64_t* target_data, + const scalar_t* input_data, + const int64_t* target_data, scalar_t* is_target_data, int64_t dim) { using accscalar_t = at::acc_type; @@ -67,8 +67,8 @@ static void multilabel_margin_loss_forward_out_frame( int64_t nframe, int64_t dim) { using accscalar_t = at::acc_type; - scalar_t* input_data = input_contiguous.data_ptr(); - int64_t* target_data = target_contiguous.data_ptr(); + const scalar_t* input_data = input_contiguous.const_data_ptr(); + const int64_t* target_data = target_contiguous.const_data_ptr(); scalar_t* is_target_data = 
is_target.data_ptr(); if (reduction != Reduction::None || output.dim() == 0) { @@ -168,9 +168,9 @@ static void multilabel_margin_loss_backward_out_frame( TORCH_CHECK( is_target_contiguous.max().item() <= 1, is_target_arg, " is out of range"); - scalar_t* input_data = input_contiguous.data_ptr(); - int64_t* target_data = target_contiguous.data_ptr(); - scalar_t* is_target_data = is_target_contiguous.data_ptr(); + const scalar_t* input_data = input_contiguous.const_data_ptr(); + const int64_t* target_data = target_contiguous.const_data_ptr(); + const scalar_t* is_target_data = is_target_contiguous.const_data_ptr(); scalar_t g = static_cast( // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) reduction == Reduction::Mean ? 1. / (nframe * dim) : 1. / dim); @@ -204,13 +204,13 @@ static void multilabel_margin_loss_backward_out_frame( if (reduction != Reduction::None || grad_output.dim() == 0) { assert( reduction != Reduction::None || grad_output.dim() > 0 || nframe == 1); - const auto d = *grad_output.data_ptr(); + const auto d = *grad_output.const_data_ptr(); for (int64_t t = 0; t < nframe * dim; t++) { grad_input_data[t] *= d; } } else { check_dim_size(grad_output, 1, 0, nframe); - auto grad_output_acc = grad_output.accessor(); + auto grad_output_acc = grad_output.accessor(); for (const auto t : c10::irange(nframe)) { for (const auto d : c10::irange(dim)) { grad_input_data[t * dim + d] *= grad_output_acc[t]; diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index 32495aab10fc6..5b2f5ae1863b7 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -59,11 +59,11 @@ inline int64_t target_index_checked( template static inline void multi_margin_loss_cpu_kernel( Tensor& output, - scalar_t* input_data, - int64_t* target_data, + const scalar_t* input_data, + const int64_t* target_data, const int p, scalar_t margin, - scalar_t* weight_data, + const scalar_t* weight_data, const int64_t nframe, const int64_t dim, const int64_t reduction) { @@ -131,10 +131,10 @@ void multi_margin_loss_out_cpu_template( AT_DISPATCH_FLOATING_TYPES( input.scalar_type(), "multi_margin_loss_cpu_kernel", [&] { - auto input_data = input_contiguous.data_ptr(); - auto target_data = target_contiguous.data_ptr(); + auto input_data = input_contiguous.const_data_ptr(); + auto target_data = target_contiguous.const_data_ptr(); auto weight_data = - weight_contiguous.defined() ? weight_contiguous.data_ptr() : nullptr; + weight_contiguous.defined() ? 
weight_contiguous.const_data_ptr() : nullptr; multi_margin_loss_cpu_kernel( output, input_data, @@ -152,12 +152,12 @@ template static void multi_margin_loss_backward_cpu_kernel( scalar_t* grad_input_data, const Tensor& grad_output, - scalar_t* input_data, - int64_t* target_data, + const scalar_t* input_data, + const int64_t* target_data, int p, scalar_t margin, scalar_t g, - scalar_t* weight_data, + const scalar_t* weight_data, int64_t nframe, int64_t dim, int64_t reduction) { @@ -193,12 +193,12 @@ static void multi_margin_loss_backward_cpu_kernel( assert( reduction != Reduction::None || grad_output.dim() > 0 || nframe == 1); // check 1d scalar fallback-case - const auto d = *grad_output.data_ptr(); + const auto d = *grad_output.const_data_ptr(); for (int64_t t = 0; t < nframe * dim; t++) { grad_input_data[t] *= d; } } else { - auto grad_output_acc = grad_output.accessor(); + auto grad_output_acc = grad_output.accessor(); for (const auto t : c10::irange(nframe)) { for (const auto d : c10::irange(dim)) { grad_input_data[t * dim + d] *= grad_output_acc[t]; @@ -236,10 +236,10 @@ void multi_margin_loss_backward_out_cpu_template( AT_DISPATCH_FLOATING_TYPES( input.scalar_type(), "multi_margin_loss_backward_cpu_kernel", [&] { auto grad_input_data = grad_input.mutable_data_ptr(); - auto input_data = input_contiguous.data_ptr(); - auto target_data = target_contiguous.data_ptr(); + auto input_data = input_contiguous.const_data_ptr(); + auto target_data = target_contiguous.const_data_ptr(); auto weight_data = weight_contiguous.defined() - ? weight_contiguous.data_ptr() + ? weight_contiguous.const_data_ptr() : nullptr; scalar_t g = reduction == Reduction::Mean ? static_cast(1. / (nframe * dim)) diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 86d74e8fec012..0e7de9c27252a 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -147,7 +147,11 @@ inline Tensor optional_contiguous(const Tensor& source) { // or nullptr if the tensor is undefined. template inline scalar_t* optional_data(const Tensor& source) { - return source.defined() ? source.data_ptr() : nullptr; + if constexpr (std::is_const::value) { + return source.defined() ? source.const_data_ptr() : nullptr; + } else { + return source.defined() ? source.data_ptr() : nullptr; + } } template @@ -166,14 +170,14 @@ static void nll_loss_out_frame( *total_weight_data = 0; auto weight_contiguous = optional_contiguous(weight); - const scalar_t* weight_data = optional_data(weight_contiguous); + const scalar_t* weight_data = optional_data(weight_contiguous); if (reduction == Reduction::None && n_dims == 2) { const auto batch_size = input.size(0); at::native::resize_output(output, {batch_size}); - auto input_acc = input.accessor(); - auto target_acc = target.accessor(); + auto input_acc = input.accessor(); + auto target_acc = target.accessor(); auto output_acc = output.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { @@ -219,8 +223,8 @@ static void nll_loss_out_frame( auto input_contiguous = input.contiguous(); auto target_contiguous = target.contiguous(); - const scalar_t* input_data = input_contiguous.data_ptr(); - const target_t* target_data = target_contiguous.data_ptr(); + const scalar_t* input_data = input_contiguous.const_data_ptr(); + const target_t* target_data = target_contiguous.const_data_ptr(); const int64_t ndim = input.dim(); const int64_t batch_size = ndim == 1 ? 
1 : input.size(0); @@ -300,8 +304,12 @@ void nll_loss_forward_out_cpu_template( const Tensor& weight, int64_t reduction, int64_t ignore_index) { - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, input.scalar_type(), "nll_loss_out_frame", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::BFloat16, + ScalarType::Half, + input.scalar_type(), + "nll_loss_out_frame", + [&] { if (target.scalar_type() == kByte) { nll_loss_out_frame( output, @@ -342,15 +350,15 @@ static void nll_loss_backward_out_frame( if (target.dim() == 0) { target_ = target.unsqueeze(0); } - auto target_acc = target_.accessor(); + auto target_acc = target_.accessor(); auto weight_contiguous = optional_contiguous(weight); - const scalar_t* weight_data = optional_data(weight_contiguous); + const scalar_t* weight_data = optional_data(weight_contiguous); if (reduction == Reduction::None && n_dims == 2) { const auto batch_size = input.size(0); auto grad_input_acc = grad_input.accessor(); - auto grad_output_acc = grad_output.accessor(); + auto grad_output_acc = grad_output.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { for (const auto i : c10::irange(start, end)) { auto cur_target = target_acc[i]; @@ -365,9 +373,9 @@ static void nll_loss_backward_out_frame( return; } - const scalar_t total_weight_value = *total_weight.data_ptr(); + const scalar_t total_weight_value = *total_weight.const_data_ptr(); - const scalar_t grad_output_value = *grad_output.data_ptr(); + const scalar_t grad_output_value = *grad_output.const_data_ptr(); if (input.dim() == 1) { auto grad_input_acc = grad_input.accessor(); @@ -411,8 +419,9 @@ void nll_loss_backward_out_cpu_template( const Tensor& total_weight) { grad_input.zero_(); - AT_DISPATCH_FLOATING_TYPES_AND( + AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::BFloat16, + ScalarType::Half, input.scalar_type(), "nll_loss_backward_out_frame", [&] { @@ -720,12 +729,12 @@ Tensor nll_loss_nd_symint( input_ = input_.contiguous(); target_ = target_.contiguous(); // support empty batches, see #15870 - if (input_.numel() > 0) { + if (input_.sym_numel() > 0) { input_ = input_.view_symint({n, std::move(c), 1, -1}); } else { input_ = input_.view_symint({n, std::move(c), 0, 0}); } - if (target_.numel() > 0) { + if (target_.sym_numel() > 0) { target_ = target_.view_symint({std::move(n), 1, -1}); } else { target_ = target_.view_symint({std::move(n), 0, 0}); diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index 2d210901efc39..94c667dcb1b2b 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -35,7 +35,11 @@ inline Tensor optional_contiguous(const Tensor& source) { // or nullptr if the tensor is undefined. template inline scalar_t* optional_data(const Tensor& source) { - return source.defined() ? source.data_ptr() : nullptr; + if constexpr (std::is_const::value) { + return source.defined() ? source.const_data_ptr() : nullptr; + } else { + return source.defined() ? 
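Note: the dispatch changes above widen several CPU loss kernels from {float, double, BFloat16} to also cover Half by switching to the AT_DISPATCH_FLOATING_TYPES_AND2 variant, which takes two extra scalar types. A small sketch of how that macro is used (the function and kernel body are hypothetical):

#include <ATen/ATen.h>
#include <ATen/Dispatch.h>

void scale_inplace(at::Tensor& self, double factor) {
  TORCH_CHECK(self.is_contiguous(), "expected a contiguous tensor");
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::BFloat16,
      at::ScalarType::Half,
      self.scalar_type(),
      "scale_inplace",
      [&] {
        // scalar_t is defined by the dispatch macro for each covered dtype
        scalar_t* data = self.mutable_data_ptr<scalar_t>();
        for (int64_t i = 0; i < self.numel(); ++i) {
          data[i] = static_cast<scalar_t>(static_cast<double>(data[i]) * factor);
        }
      });
}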
source.data_ptr() : nullptr; + } } inline void check_inputs_nll_loss2d( @@ -109,7 +113,7 @@ static void nll_loss2d_forward_out_frame( *total_weight_data = 0; auto weight_contiguous = optional_contiguous(weight); - const scalar_t* weight_data = optional_data(weight_contiguous); + const scalar_t* weight_data = optional_data(weight_contiguous); if (reduction == Reduction::None) { const int64_t batch_size = input.size(0); @@ -117,9 +121,9 @@ static void nll_loss2d_forward_out_frame( const int64_t W = input.size(3); at::native::resize_output(output, {batch_size, H, W}); - auto input_acc = input.accessor(); + auto input_acc = input.accessor(); auto output_acc = output.accessor(); - auto target_acc = target.accessor(); + auto target_acc = target.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { for (const auto b : c10::irange(start, end)) { @@ -170,8 +174,8 @@ static void nll_loss2d_forward_out_frame( auto input_contiguous = input.contiguous(); auto target_contiguous = target.contiguous(); - const scalar_t* input_data = input_contiguous.data_ptr(); - const int64_t* target_data = target_contiguous.data_ptr(); + const scalar_t* input_data = input_contiguous.const_data_ptr(); + const int64_t* target_data = target_contiguous.const_data_ptr(); const int64_t batch_size = input.size(0); const int64_t map_size = input.size(2) * input.size(3); @@ -258,8 +262,9 @@ void nll_loss2d_forward_out_cpu_template( check_inputs_nll_loss2d(input, target, weight); total_weight.resize_({}); - AT_DISPATCH_FLOATING_TYPES_AND( + AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::BFloat16, + ScalarType::Half, input.scalar_type(), "nll_loss2d_forward_out_frame", [&] { @@ -285,7 +290,7 @@ static void nll_loss2d_backward_out_frame( int64_t ignore_index, const Tensor& total_weight) { auto weight_contiguous = optional_contiguous(weight); - const scalar_t* weight_data = optional_data(weight_contiguous); + const scalar_t* weight_data = optional_data(weight_contiguous); if (reduction == at::Reduction::None) { check_gradout_shape_nll_loss2d(grad_output, target); @@ -295,8 +300,8 @@ static void nll_loss2d_backward_out_frame( const int64_t W = input.size(3); auto grad_input_acc = grad_input.accessor(); - auto grad_output_acc = grad_output.accessor(); - auto target_acc = target.accessor(); + auto grad_output_acc = grad_output.accessor(); + auto target_acc = target.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { for (const auto b : c10::irange(start, end)) { @@ -319,17 +324,17 @@ static void nll_loss2d_backward_out_frame( return; } - const scalar_t total_weight_value = *total_weight.data_ptr(); + const scalar_t total_weight_value = *total_weight.const_data_ptr(); TORCH_CHECK( grad_output.dim() <= 1 && grad_output.numel() == 1, "Expected a single element grad_output tensor, but got: ", grad_output.sizes()); - const scalar_t grad_output_value = *grad_output.data_ptr(); + const scalar_t grad_output_value = *grad_output.const_data_ptr(); const auto target_contiguous = target.contiguous(); - const int64_t* target_data = target_contiguous.data_ptr(); + const int64_t* target_data = target_contiguous.const_data_ptr(); scalar_t* grad_input_data = grad_input.mutable_data_ptr(); @@ -379,8 +384,9 @@ void nll_loss2d_backward_out_cpu_template( total_weight.numel(), " elements)"); - AT_DISPATCH_FLOATING_TYPES_AND( + AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::BFloat16, + ScalarType::Half, input.scalar_type(), "nll_loss2d_backward_out_frame", [&] { diff --git a/aten/src/ATen/native/Math.h 
b/aten/src/ATen/native/Math.h index f944a518ed63f..092ee00992e9d 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp index de0806923a32f..fbac5d4cc72c2 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp @@ -298,13 +298,16 @@ void slow_conv_transpose2d_out_cpu_template( } columns.zero_(); + // Materialize if COW, since we cannot do so during parallel_for + output.mutable_data_ptr(); + AT_DISPATCH_FLOATING_TYPES_AND3(at::ScalarType::Long, at::ScalarType::BFloat16, at::ScalarType::Half, input.scalar_type(), "slow_conv_transpose2d_out_cpu", [&] { at::parallel_for(0, batch_size, 0, [&](int64_t begin, int64_t end) { // For each elt in batch, do: for (const auto elt : c10::irange(begin, end)) { - // Matrix mulitply per output: + // Matrix multiply per output: Tensor input_n = input_.select(0, elt); Tensor output_n = output.select(0, elt); Tensor columns_n = columns.select(0, elt); @@ -353,7 +356,7 @@ void slow_conv_transpose2d_out_cpu_template( // Unpack columns back into input: col2im( - columns_n.data_ptr(), + columns_n.const_data_ptr(), n_output_plane, output_height, output_width, @@ -501,14 +504,14 @@ static void slow_conv_transpose2d_backward_out_cpu_template( // For each elt in batch, do: for (const auto elt : c10::irange(batch_size)) { - // Matrix mulitply per sample: + // Matrix multiply per sample: grad_input_n = grad_input.select(0, elt); grad_output_n = grad_output.select(0, elt); if (need_columns) { // Extract columns: im2col( - grad_output_n.data_ptr(), + grad_output_n.const_data_ptr(), n_output_plane, output_height, output_width, @@ -526,8 +529,8 @@ static void slow_conv_transpose2d_backward_out_cpu_template( use_channels_last); } - auto gemm_in_ptr = need_columns ? grad_columns.data_ptr() - : grad_output_n.data_ptr(); + auto gemm_in_ptr = need_columns ? grad_columns.const_data_ptr() + : grad_output_n.const_data_ptr(); if (use_channels_last) { int64_t m = n_input_plane; @@ -695,18 +698,18 @@ void slow_conv_transpose2d_acc_grad_parameters_cpu( // For each elt in batch, do: for (const auto elt : c10::irange(batch_size)) { - // Matrix mulitply per output: + // Matrix multiply per output: grad_output_n = grad_output.select(0, elt); // Do Weight: if (grad_weight.defined()) { - // Matrix mulitply per output: + // Matrix multiply per output: input_n = input.select(0, elt); if (need_columns) { // Extract columns: im2col( - grad_output_n.data_ptr(), + grad_output_n.const_data_ptr(), n_output_plane, output_height, output_width, @@ -724,8 +727,8 @@ void slow_conv_transpose2d_acc_grad_parameters_cpu( use_channels_last); } - auto gemm_in_ptr = need_columns ? columns.data_ptr() - : grad_output_n.data_ptr(); + auto gemm_in_ptr = need_columns ? 
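Note: the output.mutable_data_ptr(); call added above ("Materialize if COW") forces any lazy copy-on-write storage to materialize once, on the calling thread, because doing it lazily from inside at::parallel_for would not be safe. The same precaution in a standalone sketch (names are made up):

#include <ATen/ATen.h>
#include <ATen/Parallel.h>

void fill_rows_with_index(at::Tensor& out) {
  TORCH_CHECK(out.dim() >= 1, "expected at least a 1-d tensor");
  // Grab a mutable pointer once, up front, so a copy-on-write storage is
  // materialized before any worker thread touches it.
  out.mutable_data_ptr();
  at::parallel_for(0, out.size(0), /*grain_size=*/0,
                   [&](int64_t begin, int64_t end) {
    for (int64_t i = begin; i < end; ++i) {
      out.select(0, i).fill_(static_cast<double>(i));
    }
  });
}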
columns.const_data_ptr() + : grad_output_n.const_data_ptr(); if (use_channels_last) { int64_t m = kernel_height * kernel_width * n_output_plane; diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp index a9f02117dd802..624e820c7ba66 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp @@ -329,7 +329,7 @@ void slow_conv_transpose3d_out_cpu_template( // Unpack columns back into input: at::native::col2vol( - columns.data_ptr(), + columns.const_data_ptr(), n_output_plane, output_depth, output_height, @@ -562,8 +562,8 @@ void slow_conv_transpose3d_backward_out_cpu_template( // Do GEMM (note: this is a bit confusing because gemm assumes // column-major matrices) - auto gemm_in_ptr = need_columns ? grad_columns.data_ptr() - : grad_output_n.data_ptr(); + auto gemm_in_ptr = need_columns ? grad_columns.const_data_ptr() + : grad_output_n.const_data_ptr(); cpublas::gemm( TransposeType::NoTranspose, TransposeType::NoTranspose, @@ -782,8 +782,8 @@ void slow_conv_transpose3d_acc_grad_parameters_cpu( // Do GEMM (note: this is a bit confusing because gemm assumes // column-major matrices) - auto gemm_in_ptr = need_columns ? columns.data_ptr() - : grad_output_n.data_ptr(); + auto gemm_in_ptr = need_columns ? columns.const_data_ptr() + : grad_output_n.const_data_ptr(); cpublas::gemm( TransposeType::Transpose, TransposeType::NoTranspose, diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 4bc7dbc139b68..93d2ce11d934f 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -29,6 +29,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include @@ -199,6 +204,7 @@ std::tuple batch_norm_cpu_update_stats_template( using accscalar_t = at::acc_type; int64_t n_input = input.size(1); + TORCH_CHECK(input.numel() != 0, "input tensor must have at least one element, but got input_sizes = ", input.sizes()); int64_t n = input.numel() / n_input; bool all_contiguous = is_contiguous(input); @@ -332,18 +338,18 @@ std::tuple batch_norm_backward_cpu_template( return std::make_tuple(grad_input, grad_weight, grad_bias); } - auto weight_a = conditional_accessor_1d(weight); + auto weight_a = conditional_accessor_1d(weight); auto grad_weight_a = conditional_accessor_1d(grad_weight); auto grad_bias_a = conditional_accessor_1d(grad_bias); int64_t n_input = input.size(1); int64_t n = input.numel() / n_input; - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); const int64_t ndim = input.dim(); @@ -358,8 +364,8 @@ std::tuple batch_norm_backward_cpu_template( auto sum_a = sum.accessor(); auto reduce_iter = TensorIteratorConfig() - .add_input(input) - .add_input(grad_out_) + .add_const_input(input) + .add_const_input(grad_out_) .resize_outputs(false) .declare_static_shape(input.sizes(), /*squash_dims=*/1) .build(); @@ -370,7 +376,7 @@ std::tuple batch_norm_backward_cpu_template( unary_iter.build( TensorIteratorConfig() .add_output(grad_input) 
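Note: the new TORCH_CHECK(input.numel() != 0, ...) in batch_norm_cpu_update_stats_template above makes the CPU stats update fail with a readable message on empty inputs instead of dividing by zero when it computes n = input.numel() / n_input. A sketch of a call that now reports that error (shapes are illustrative):

#include <ATen/ATen.h>

void empty_batch_norm_example() {
  at::Tensor x  = at::empty({0, 3, 4, 4});            // zero-element input
  at::Tensor w  = at::ones({3}),  b  = at::zeros({3});
  at::Tensor rm = at::zeros({3}), rv = at::ones({3});
  try {
    // training=true routes through the CPU stats update for a CPU tensor
    at::batch_norm(x, w, b, rm, rv, /*training=*/true,
                   /*momentum=*/0.1, /*eps=*/1e-5, /*cudnn_enabled=*/true);
  } catch (const c10::Error& e) {
    // "input tensor must have at least one element, but got input_sizes = ..."
  }
}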
- .add_input(train ? input : grad_out_) + .add_const_input(train ? input : grad_out_) .resize_outputs(false) .declare_static_shape(input.sizes(), /*squash_dims=*/1)); @@ -379,18 +385,18 @@ std::tuple batch_norm_backward_cpu_template( TensorIteratorConfig() .add_output(grad_input) .add_input(grad_input) - .add_input(grad_out_) + .add_const_input(grad_out_) .resize_outputs(false) .declare_static_shape(input.sizes(), /*squash_dims=*/1)); } } auto in_channel_stride = input.strides()[1]; - auto in_data = input.data_ptr(); + auto in_data = input.const_data_ptr(); auto grad_in_channel_stride = grad_input_mask[0] ? grad_input.strides()[1] : 0; auto grad_in_data = grad_input_mask[0] ? grad_input.mutable_data_ptr() : nullptr; auto grad_out_channel_stride = grad_out_.strides()[1]; - auto grad_out_data = grad_out_.data_ptr(); + auto grad_out_data = grad_out_.const_data_ptr(); parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) { TensorIterator reduce_iter_local(reduce_iter); @@ -409,12 +415,12 @@ std::tuple batch_norm_backward_cpu_template( invstd = 1 / std::sqrt(running_var_a[f] + eps); } - // dot product of the Q(X) and gradOuput + // dot product of the Q(X) and gradOutput accscalar_t dotp = 0; reduce_iter_local.unsafe_replace_operand( - 0, in_data + f * in_channel_stride); + 0, const_cast(in_data + f * in_channel_stride)); reduce_iter_local.unsafe_replace_operand( - 1, grad_out_data + f * grad_out_channel_stride); + 1, const_cast(grad_out_data + f * grad_out_channel_stride)); cpu_serial_kernel(reduce_iter_local, [&](const scalar_t i, const scalar_t go) -> void { dotp += (i - mean) * go; @@ -433,7 +439,7 @@ std::tuple batch_norm_backward_cpu_template( unary_iter_local.unsafe_replace_operand( 0, grad_in_data + f * grad_in_channel_stride); unary_iter_local.unsafe_replace_operand( - 1, in_data + f * in_channel_stride); + 1, const_cast(in_data + f * in_channel_stride)); cpu_serial_kernel(unary_iter_local, [&](const scalar_t i) -> scalar_t { return (i - mean) * k; }); @@ -445,7 +451,7 @@ std::tuple batch_norm_backward_cpu_template( binary_iter_local.unsafe_replace_operand(0, gI_data); binary_iter_local.unsafe_replace_operand(1, gI_data); binary_iter_local.unsafe_replace_operand( - 2, grad_out_data + f * grad_out_channel_stride); + 2, const_cast(grad_out_data + f * grad_out_channel_stride)); cpu_serial_kernel(binary_iter_local, [&](scalar_t gi, scalar_t go) -> scalar_t { return (go - grad_mean - gi) * invstd * w; }); @@ -459,7 +465,7 @@ std::tuple batch_norm_backward_cpu_template( unary_iter_local.unsafe_replace_operand( 0, grad_in_data + f * grad_in_channel_stride); unary_iter_local.unsafe_replace_operand( - 1, grad_out_data + f * grad_out_channel_stride); + 1, const_cast(grad_out_data + f * grad_out_channel_stride)); cpu_serial_kernel(unary_iter_local, [&](const scalar_t i) -> scalar_t { return i * invstd * w; }); @@ -478,10 +484,58 @@ std::tuple batch_norm_backward_cpu_template( return std::make_tuple(grad_input, grad_weight, grad_bias); } +BatchNormBackend _select_batch_norm_backend( + const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, + const Tensor& running_var, bool training, double eps) { + + auto& ctx = at::globalContext(); + bool cudnn_enabled = ctx.userEnabledCuDNN(); + + if ( + input.is_cuda() + && input.scalar_type() != at::kBFloat16 && weight.scalar_type() != at::kBFloat16 + && (input.scalar_type() != at::kHalf + || weight.scalar_type() == at::kFloat) + && weight.defined() && bias.defined() + && ((running_mean.defined() && 
running_var.defined()) + || (!running_mean.defined() && !running_var.defined() && training)) + && (input.dim() >= 3) + && ((input.sym_size(0) <= 880801 && training) // spatial, training + ||(input.sym_size(0) <= 65535 && !training)) //spatial, eval + && detail::getCUDAHooks().compiledWithCuDNN() + && eps >= detail::getCUDAHooks().batchnormMinEpsilonCuDNN() + && cudnn_enabled && detail::getCUDAHooks().versionCuDNN() >= 5110L + && input.sym_numel() < std::numeric_limits::max() // some cuDNN kernels have 32-bit indexing limitations + ) { + return BatchNormBackend::Cudnn; + } + + if ( + input.is_cuda() + && input.dim() <= MIOPEN_DIM_MAX + && input.scalar_type() != at::kDouble + && input.scalar_type() != at::kBFloat16 + && (weight.scalar_type() != at::kHalf) + && weight.defined() && bias.defined() + && ((running_mean.defined() && running_var.defined()) + || (!running_mean.defined() && !running_var.defined() && training)) + && detail::getCUDAHooks().compiledWithMIOpen() + && cudnn_enabled + && input.suggest_memory_format() != MemoryFormat::ChannelsLast + && input.suggest_memory_format() != MemoryFormat::ChannelsLast3d + ) { + return BatchNormBackend::Miopen; + } + + return BatchNormBackend::Native; +} + + // _batch_norm_impl_index(_backward) are used in the JIT be able to keep the run-time selection // of backends, while enabling it to keep the information about the used backend, so that it can // use its corresponding backward implementation. // XXX: The indices of backends need to be kept synchronized between this function and its _backward. +// TODO: remove cudnn_enabled arg std::tuple _batch_norm_impl_index( const Tensor& input, const c10::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, bool training, double momentum, double eps, bool cudnn_enabled) { @@ -526,32 +580,16 @@ std::tuple _batch_norm_impl_index( check_dims_match_num_input_features("bias", std::move(num_features), bias.sym_numel()); } - const bool use_cudnn = ( - input.is_cuda() - && input.scalar_type() != at::kBFloat16 && weight.scalar_type() != at::kBFloat16 - && (input.scalar_type() != at::kHalf - || weight.scalar_type() == at::kFloat) - && weight.defined() && bias.defined() - && ((running_mean.defined() && running_var.defined()) - || (!running_mean.defined() && !running_var.defined() && training)) - && (input.dim() >= 3) - && ((input.sym_size(0) <= 880801 && training) // spatial, training - ||(input.sym_size(0) <= 65535 && !training)) //spatial, eval - && detail::getCUDAHooks().compiledWithCuDNN() - && eps >= detail::getCUDAHooks().batchnormMinEpsilonCuDNN() - && cudnn_enabled && detail::getCUDAHooks().versionCuDNN() >= 5110L - && input.sym_numel() < std::numeric_limits::max() // some cuDNN kernels have 32-bit indexing limitations - ); + BatchNormBackend backend = _select_batch_norm_backend(input, weight, bias, running_mean, running_var, training, eps); - if (use_cudnn) { + if (backend == BatchNormBackend::Cudnn) { auto input_c = input.contiguous(input.suggest_memory_format()); auto weight_c = weight.contiguous(); auto bias_c = bias.contiguous(); auto rmean_c = running_mean.defined() ? running_mean.contiguous() : running_mean; auto rvar_c = running_var.defined() ? 
running_var.contiguous() : running_var; - Tensor output, save_mean, save_var, reserve; - std::tie(output, save_mean, save_var, reserve) = + auto [output, save_mean, save_var, reserve] = at::cudnn_batch_norm(input_c, weight_c, bias_c, rmean_c, rvar_c, training, momentum, eps); @@ -561,19 +599,7 @@ std::tuple _batch_norm_impl_index( Tensor reserve = at::empty({0}, input.options().dtype(kByte)); - bool use_miopen = (input.is_cuda() - && input.dim() <= MIOPEN_DIM_MAX - && input.scalar_type() != at::kDouble - && input.scalar_type() != at::kBFloat16 - && (weight.scalar_type() != at::kHalf) - && weight.defined() && bias.defined() - && ((running_mean.defined() && running_var.defined()) - || (!running_mean.defined() && !running_var.defined() && training)) - && detail::getCUDAHooks().compiledWithMIOpen() - && cudnn_enabled - ); - - if (use_miopen && input.suggest_memory_format() != MemoryFormat::ChannelsLast && input.suggest_memory_format() != MemoryFormat::ChannelsLast3d) { + if (backend == BatchNormBackend::Miopen) { return std::tuple_cat( at::miopen_batch_norm( input.contiguous(), weight.contiguous(), bias.contiguous(), @@ -637,6 +663,7 @@ std::tuple _batch_norm_impl_index_backward( TORCH_INTERNAL_ASSERT(false, "Unsupported impl_index in _batch_norm_impl_index_backward: ", impl_index); } +// TODO: remove cudnn_enabled arg Tensor batch_norm( const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, @@ -647,6 +674,30 @@ Tensor batch_norm( const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); return std::get<0>(at::_batch_norm_impl_index(input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled)); + // TODO: switch to the new stack after the 2 week FC window + // if (training) { + // BatchNormBackend backend = _select_batch_norm_backend(input, weight, bias, running_mean, running_var, training, eps); + // if (backend == BatchNormBackend::Cudnn || backend == BatchNormBackend::Miopen) { + // auto input_c = input; + // if (backend == BatchNormBackend::Cudnn) { + // input_c = input.contiguous(input.suggest_memory_format()); + // } else { + // input_c = input.contiguous(); + // } + // auto weight_c = weight.contiguous(); + // auto bias_c = bias.contiguous(); + // auto rmean_c = running_mean.defined() ? running_mean.contiguous() : running_mean; + // auto rvar_c = running_var.defined() ? 
running_var.contiguous() : running_var; + // return std::get<0>(at::_batch_norm_with_update(input_c, weight_c, bias_c, const_cast(rmean_c), + // const_cast(rvar_c), momentum, eps)); + // } else { + // return std::get<0>(at::_batch_norm_with_update(input, weight, bias, const_cast(running_mean), + // const_cast(running_var), momentum, eps)); + // } + // } else { + // return std::get<0>(at::_batch_norm_no_update(input, weight, bias, running_mean, running_var, + // momentum, eps)); + // } } Tensor instance_norm( @@ -798,6 +849,38 @@ std::tuple batch_norm_cpu(const Tensor& self, const c10: return batch_norm_cpu_out(self, weight_opt, bias_opt, running_mean_opt, running_var_opt, train, momentum, eps, output, save_mean, save_var); } +std::tuple _batch_norm_with_update_cpu( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + Tensor& running_mean, Tensor& running_var, double momentum, double eps) { + Tensor output, save_mean, save_var; + std::tie(output, save_mean, save_var) = + batch_norm_cpu(input, weight_opt, bias_opt, running_mean, running_var, /*update*/true, momentum, eps); + Tensor reserve = at::empty({0}, input.options().dtype(kByte)); + return std::tuple(output, save_mean, save_var, reserve); +} + +std::tuple _batch_norm_with_update_cpu_out( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + Tensor& running_mean, Tensor& running_var, double momentum, double eps, + Tensor& out, Tensor& save_mean, Tensor& save_var, Tensor& reserve) { + std::tie(out, save_mean, save_var) = + batch_norm_cpu_out(input, weight_opt, bias_opt, running_mean, running_var, /*update*/true, momentum, eps, out, save_mean, save_var); + return std::tuple(out, save_mean, save_var, reserve); +} + + +std::tuple _batch_norm_no_update( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + double momentum, double eps) { + const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); + const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + Tensor output, save_mean, save_var; + std::tie(output, save_mean, save_var) = + batch_norm_cpu(input, weight_opt, bias_opt, const_cast(running_mean), const_cast(running_var), /*update*/false, momentum, eps); + Tensor reserve = at::empty({0}, input.options().dtype(kByte)); + return std::tuple(output, save_mean, save_var, reserve); +} std::tuple _batch_norm_legit_cpu( const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, @@ -826,6 +909,13 @@ std::tuple _batch_norm_legit_no_stats_cpu_out(const T return batch_norm_cpu_out(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, eps, out, save_mean, save_var); } +std::tuple _new_batch_norm_backward_cpu( + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { + return batch_norm_backward_cpu(grad_output, input, weight, running_mean_opt, running_var_opt, save_mean_opt, save_var_opt, update, eps, grad_input_mask); +} std::tuple batch_norm_backward_cpu(const Tensor& grad_out, const Tensor& self, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& 
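Note: the refactor above hoists the cuDNN/MIOpen eligibility predicates into _select_batch_norm_backend, so _batch_norm_impl_index (and the not-yet-enabled _batch_norm_with_update path) can branch on a single enum instead of duplicating the checks. A sketch of how an in-tree caller might consume it, assuming the internal ATen/native/Normalization.h header is reachable:

#include <ATen/ATen.h>
#include <ATen/native/Normalization.h>

const char* chosen_batch_norm_backend(
    const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_var,
    bool training, double eps) {
  using at::native::BatchNormBackend;
  switch (at::native::_select_batch_norm_backend(
      input, weight, bias, running_mean, running_var, training, eps)) {
    case BatchNormBackend::Cudnn:  return "cudnn";
    case BatchNormBackend::Miopen: return "miopen";
    default:                       return "native";
  }
}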
save_mean_opt, const c10::optional& save_invstd_opt, bool train, double eps, std::array grad_input_mask) { diff --git a/aten/src/ATen/native/Normalization.h b/aten/src/ATen/native/Normalization.h index 6cd4dcde37052..1ba99e77b65c8 100644 --- a/aten/src/ATen/native/Normalization.h +++ b/aten/src/ATen/native/Normalization.h @@ -8,4 +8,12 @@ namespace at::native { using renorm_scale_factor_fn = void (*) (TensorIteratorBase& iter, double maxnorm); DECLARE_DISPATCH(renorm_scale_factor_fn, renorm_scale_factor_stub); +enum class BatchNormBackend { + Native, + Cudnn, + Miopen, +}; + +TORCH_API BatchNormBackend _select_batch_norm_backend(const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double eps); + } // namespace at::native diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp index 91c3c8f1611e4..97c35599f791c 100644 --- a/aten/src/ATen/native/Onehot.cpp +++ b/aten/src/ATen/native/Onehot.cpp @@ -42,15 +42,17 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { } // non-empty tensor - if (self.device().type() != at::kCUDA && self.device().type() != at::kMPS) { - //for cuda, rely on device assert thrown by scatter + if (self.device().type() != at::kCUDA && self.device().type() != at::kMPS && + self.device().type() != at::kPrivateUse1) { + // for cuda, rely on device assert thrown by scatter TORCH_CHECK(self.min().item().toLong() >= 0, "Class values must be non-negative."); } if (num_classes == -1) { num_classes = self.max().item().toLong() + 1; } else { - if (self.device().type() != at::kCUDA && self.device().type() != at::kMPS) { - //rely on device asserts from scatter to avoid sync here + if (self.device().type() != at::kCUDA && self.device().type() != at::kMPS && + self.device().type() != at::kPrivateUse1) { + // rely on device asserts from scatter to avoid sync here TORCH_CHECK(num_classes > self.max().item().toLong(), "Class values must be smaller than num_classes."); } else { //for cuda, assert that num_classes is at least 1 diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 33a733273a80a..07940729fda8c 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -26,6 +26,19 @@ using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel); DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel); +// averge pooling has same signature for forward and backward +using avg_pool3d_fn = void(*)(const Tensor& output, const Tensor& input, + int64_t kW, int64_t kH, int64_t kD, int64_t dW, int64_t dH, int64_t dD, + int64_t padW, int64_t padH, int64_t padD, bool count_include_pad, + c10::optional divisor_override); +using avg_pool3d_backward_fn = void(*)(const Tensor& output, const Tensor& input, + int kW, int kH, int kD, int dW, int dH, int dD, + int padW, int padH, int padD, bool count_include_pad, + c10::optional divisor_override); + +DECLARE_DISPATCH(avg_pool3d_fn, avg_pool3d_kernel); +DECLARE_DISPATCH(avg_pool3d_backward_fn, avg_pool3d_backward_kernel); + using max_pool3d_fn = void(*)(Tensor& output, Tensor& indices, const Tensor& input, int kW, int kH, int kD, int dW, int dH, int dD, int pW, int pH, int pD, int dilationW, int dilationH, int dilationD); using max_pool3d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output, const Tensor& indices); @@ -67,9 +80,9 @@ static inline T pooling_output_shape( TORCH_CHECK(stride != 0, "stride should not 
be zero"); TORCH_CHECK(pad >= 0, "pad must be non-negative, but got pad: ", pad); - TORCH_CHECK(pad <= kernelSize / 2, - "pad should be at most half of kernel size, but got pad=", - pad, " and kernel_size=", kernelSize) + TORCH_CHECK(pad <= ((kernelSize - 1) * dilation + 1) / 2, + "pad should be at most half of effective kernel size, but got pad=", + pad, ", kernel_size=", kernelSize, " and dilation=", dilation) return pooling_output_shape_pad_lr( inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode); } diff --git a/aten/src/ATen/native/Pooling.cpp b/aten/src/ATen/native/Pooling.cpp index 89eb276ed418b..0dd877a552f1d 100644 --- a/aten/src/ATen/native/Pooling.cpp +++ b/aten/src/ATen/native/Pooling.cpp @@ -68,8 +68,7 @@ std::tuple adaptive_max_pool1d(const Tensor & self, IntArrayRef o " being empty"); } - Tensor output, indices; - std::tie(output, indices) = at::adaptive_max_pool2d( + auto [output, indices] = at::adaptive_max_pool2d( self.unsqueeze(-2), {1, output_size[0]}); @@ -94,8 +93,7 @@ std::tuple max_pool1d_with_indices( NoNamesGuard guard; - Tensor output, indices; - std::tie(output, indices) = at::max_pool2d_with_indices( + auto [output, indices] = at::max_pool2d_with_indices( self.unsqueeze(-2), {1, kernel_size[0]}, {1, stride[0]}, diff --git a/aten/src/ATen/native/QuantizedLinear.cpp b/aten/src/ATen/native/QuantizedLinear.cpp index 73fd1c1a94189..5fa45f3099844 100644 --- a/aten/src/ATen/native/QuantizedLinear.cpp +++ b/aten/src/ATen/native/QuantizedLinear.cpp @@ -64,7 +64,7 @@ Tensor fbgemm_linear_int8_weight_fp32_activation( "and will be removed in a future PyTorch release.") const Tensor input_contig = input.contiguous(); - const float* input_ptr = input_contig.data_ptr(); + const float* input_ptr = input_contig.const_data_ptr(); TORCH_CHECK(input.dim() >= 2); // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) @@ -125,6 +125,9 @@ Tensor fbgemm_linear_int8_weight_fp32_activation( auto& pack_b = cpp_custom_type_hack::cast>(packed); + int32_t* col_offsets_data = col_offsets.data_ptr(); + float* bias_contig_data = bias_contig.data_ptr(); + const int num_tasks = at::get_num_threads(); at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) { // This operation does the following: @@ -162,8 +165,8 @@ Tensor fbgemm_linear_int8_weight_fp32_activation( /*Aq_zero_point=*/q_params.zero_point, /*Bq_zero_point=*/&weight_zero_point_int32, /*row_offsets=*/pack_a.getRowOffsetBuffer(), - /*col_offsets=*/col_offsets.data_ptr(), - /*bias=*/bias_contig.data_ptr(), + /*col_offsets=*/col_offsets_data, + /*bias=*/bias_contig_data, /*nCol=*/N); // Do the GEMM fbgemm::fbgemmPacked( @@ -302,7 +305,7 @@ Tensor fbgemm_pack_quantized_matrix(const Tensor& weight) { const int64_t K = weight.size(1); const int64_t N = weight.size(0); const Tensor weight_contig = weight.contiguous(); - const int8_t* weight_ptr = weight_contig.data_ptr(); + const int8_t* weight_ptr = weight_contig.const_data_ptr(); auto ptr = std::make_unique>( /*trans=*/fbgemm::matrix_op_t::Transpose, /*nRow=*/K, @@ -421,7 +424,7 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU doesn't support FBGEMM."); const Tensor input_contig = input.contiguous(); - const float* input_ptr = input_contig.data_ptr(); + const float* input_ptr = input_contig.const_data_ptr(); // Pull out the PackedGemmMatrixFP16 instance from the owning tensor const fbgemm::PackedGemmMatrixFP16& packed_weight_fp16 = diff --git a/aten/src/ATen/native/RNN.cpp 
b/aten/src/ATen/native/RNN.cpp index c2a901989717c..97ce09ac8e51d 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -71,6 +71,12 @@ bool use_miopen(const at::Tensor& input, const double dropout_state) { (detail::getCUDAHooks().compiledWithMIOpen()) && (input.is_cuda()) && (at::globalContext().userEnabledCuDNN()); + // MIOpen functions returns miopenStatusBadParm on empty + // tensors. Maybe some functions actually support empty tensors, but + // native kernels shouldn't be much slower because the output is also + // likely empty. + if (input.sym_numel() == 0) return false; + return is_miopen_acceptable; } @@ -290,10 +296,7 @@ struct QuantizedCellParams : public CellParamsBase { } static c10::intrusive_ptr __setstate__( CellParamsSerializationType state) { - std::vector tensors; - std::vector doubles; - std::vector longs; - std::tie(std::ignore, tensors, doubles, longs, std::ignore) = + auto [_, tensors, doubles, longs, __] = std::move(state); TORCH_INTERNAL_ASSERT(tensors.size() == 6); TORCH_INTERNAL_ASSERT(doubles.size() == 2); @@ -338,12 +341,9 @@ c10::intrusive_ptr make_quantized_cell_params( std::make_tuple(std::move(packed_weight)), std::move(params)); }; - at::Tensor qw_ih, qw_hh, packed_ih, packed_hh, col_offsets_ih, col_offsets_hh; - at::Scalar scale_ih, scale_hh, zero_point_ih, zero_point_hh; - - std::tie(packed_ih, qw_ih, col_offsets_ih, scale_ih, zero_point_ih) = + auto [packed_ih, qw_ih, col_offsets_ih, scale_ih, zero_point_ih] = make_vals(w_ih); - std::tie(packed_hh, qw_hh, col_offsets_hh, scale_hh, zero_point_hh) = + auto [packed_hh, qw_hh, col_offsets_hh, scale_hh, zero_point_hh] = make_vals(w_hh); return c10::make_intrusive( @@ -438,10 +438,7 @@ struct QuantizedCellParamsDynamic : public CellParamsBase { } static c10::intrusive_ptr __setstate__( CellParamsSerializationType state) { - std::vector tensors; - std::vector> packed_params; - std::vector serialized_ints; - std::tie(std::ignore, tensors, std::ignore, serialized_ints, packed_params) = + auto [_, tensors, __, serialized_ints, packed_params] = std::move(state); TORCH_INTERNAL_ASSERT(tensors.size() == 2); TORCH_INTERNAL_ASSERT(packed_params.size() == 2); @@ -514,10 +511,7 @@ struct QuantizedCellParamsFP16 : public CellParamsBase { } static c10::intrusive_ptr __setstate__( CellParamsSerializationType state) { - std::vector> packed_params; - std::tie( - std::ignore, std::ignore, std::ignore, std::ignore, packed_params) = - std::move(state); + auto packed_params = std::get<4>(std::move(state)); TORCH_INTERNAL_ASSERT(packed_params.size() == 2); return make_quantized_cell_params_fp16( /*w_ih_packed=*/std::move(packed_params[0]), @@ -730,7 +724,7 @@ struct LSTMCell : Cell, cell_params> { const auto& hx = std::get<0>(hidden); const auto& cx = std::get<1>(hidden); - if (input.is_cuda()) { + if (input.is_cuda() || input.is_privateuseone()) { TORCH_CHECK(!pre_compute_input); auto igates = params.matmul_ih(input); auto hgates = params.matmul_hh(hx); @@ -766,7 +760,7 @@ struct GRUCell : Cell { const hidden_type& hidden, const cell_params& params, bool pre_compute_input = false) const override { - if (input.is_cuda() || input.is_xpu()) { + if (input.is_cuda() || input.is_xpu() || input.is_privateuseone()) { TORCH_CHECK(!pre_compute_input); auto igates = params.matmul_ih(input); auto hgates = params.matmul_hh(hidden); @@ -1167,7 +1161,7 @@ bool _use_cudnn_rnn_flatten_weight() { } // NB: This a (composite) wrapper for _thnn_fused_lstm_cell_backward_impl. 
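Note: several __setstate__ and factory helpers above drop the pre-declared locals plus std::tie(..., std::ignore, ...) in favor of C++17 structured bindings, which name exactly the pieces they need. A tiny standalone illustration of that rewrite (types and values are invented):

#include <cstdint>
#include <string>
#include <tuple>
#include <vector>

using SerializationType =
    std::tuple<std::string, std::vector<double>, std::vector<int64_t>>;

SerializationType make_state() {
  return {"cell", {0.5, 0.25}, {1, 2, 3}};
}

void restore() {
  // before: std::vector<double> doubles; std::vector<int64_t> longs;
  //         std::tie(std::ignore, doubles, longs) = make_state();
  // after: one structured binding; unused fields just get throwaway names
  auto [tag, doubles, longs] = make_state();
  (void)tag;
  (void)doubles;
  (void)longs;
}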
-// It duplicates the outputs of this function so the non-composite verison doesn't have to. +// It duplicates the outputs of this function so the non-composite version doesn't have to. // The point is so that we avoid triggering TensorImpl use count asserts in debug mode std::tuple _thnn_fused_lstm_cell_backward( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, const Tensor& cx, const Tensor& cy, diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 5fa3e2b0af2d6..d29b177c13960 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -173,7 +174,7 @@ static void check_result_is_bytebool(const char* name, const Tensor& self, const // Note [all, any : uint8 compatibility]: // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// For NumPy comptability, `all` and `any` return +// For NumPy compatibility, `all` and `any` return // Tensor of dtype `bool`. However for compatibility reason, // for `uint8`, they return Tensor of same dtype `uint8`. // Reference: https://github.com/pytorch/pytorch/pull/47878#issuecomment-747108561 @@ -510,7 +511,7 @@ static Tensor reversed_cumsum(const Tensor& w, int64_t dim) { Tensor cumprod_backward(const Tensor& grad, const Tensor& input, int64_t dim, const Tensor& output) { /* We show here how to derive an O(n) gradient formula for - abitrary inputs. It follows via a basic application of the + arbitrary inputs. It follows via a basic application of the chain rule together with a number of observations for different cases. We assume that x is an n-dimensional vector and y = cumprod(x). In the actual implementation we will need to play a bit with masks @@ -527,7 +528,7 @@ Tensor cumprod_backward(const Tensor& grad, const Tensor& input, int64_t dim, co The term dF / dy_j is just grad_output[j] (assuming again everything is one-dimensional). - The term (dy_j / dx_k) is easilly seen to be + The term (dy_j / dx_k) is easily seen to be if j >= k dy_j / dx_k = prod_{1 <= i <= j, i != k} x_i @@ -589,7 +590,7 @@ Tensor cumprod_backward(const Tensor& grad, const Tensor& input, int64_t dim, co dy_j / dx_z1 = prod(x[:z1]) * (grad_output[z1] + sum(grad_output[z1+1:z2] * cumprod(x[z1+1:z2]))) - When the imputs are complex, this is map is holomorphic. As such, to compute + When the inputs are complex, this is map is holomorphic. As such, to compute its backwards is just the conjugate of the usual backwards. This simplifies to conjugating the input. 
We may also reuse the output as, since the map is holomorphic, cumprod(input.conj()) = cumprod(input).conj() @@ -1170,6 +1171,25 @@ std::vector gradient(const Tensor& self, IntArrayRef dim, int64_t edge_o // ALL REDUCE ################################################################# +inline bool should_use_acc_buffer(at::TensorIterator& iter) { + const auto ndim = iter.ndim(); + if (!iter.device().is_cpu() || iter.noutputs() != 1) { + return false; + } + if (!at::isReducedFloatingType(iter.common_dtype())) { + return false; + } + if (ndim < 2) { + return false; + } + auto out_strides = iter.strides(0); + for (const auto dim : c10::irange(0, 2)) { + if (out_strides[dim] != 0) { + return false; + } + } + return true; +} TORCH_IMPL_FUNC(sum_out) (const Tensor& self, @@ -1181,7 +1201,19 @@ TORCH_IMPL_FUNC(sum_out) if (iter.numel() == 0) { result.zero_(); } else { - sum_stub(iter.device_type(), iter); + // Here is a limitation of TensorIterator reductions for permuted input with lower precision on CPU. + // Consider the case: TensorIterator coalesces such input and output to >= 2 dims tensors, + // and the output stride is [0, 0, x, x, ...] with x >= 0 (two reduced dimensions and non-reduced dims). + // Since the reduction loop only operates on two dimensions at a time, + // the intermediate sums is forced to do accumulation in the second reduced dim with lower precision. + // See https://github.com/pytorch/pytorch/issues/83149 + if (should_use_acc_buffer(iter)) { + auto tmp_output = at::empty(result.sizes(), result.options().dtype(kFloat)); + at::sum_outf(self.to(ScalarType::Float), opt_dim, keepdim, /*dtype=*/c10::nullopt, tmp_output); + result.copy_(tmp_output); + } else{ + sum_stub(iter.device_type(), iter); + } } } @@ -1250,7 +1282,7 @@ Tensor trace_cpu(const Tensor& self) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX(self.scalar_type(), "trace", [&] { using accscalar_t = at::acc_type; accscalar_t sum = 0; - const auto* t_data = self.data_ptr(); + const auto* t_data = self.const_data_ptr(); int64_t t_stride_0, t_stride_1, t_diag_size; @@ -1726,7 +1758,7 @@ static double std_var_all_cpu(const Tensor& self, double correction, bool take_s auto mean = self.mean().item(); auto iter = TensorIteratorConfig() - .add_input(self) + .add_const_input(self) .build(); auto reduction = [&](int64_t begin, int64_t end, double thread_sum) { @@ -2197,7 +2229,7 @@ bool cpu_equal(const Tensor& self, const Tensor& other) { return true; } std::atomic result{true}; - auto iter = TensorIteratorConfig().add_input(self).build(); + auto iter = TensorIteratorConfig().add_const_input(self).build(); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "equal_notnan_cpu", [&] { iter.for_each([&](char** data, const int64_t *strides, int64_t dim_size) { if (!result) { @@ -2218,13 +2250,13 @@ bool cpu_equal(const Tensor& self, const Tensor& other) { std::atomic result{true}; auto iter = TensorIteratorConfig() - .add_input(self) - .add_input(other) + .add_const_input(self) + .add_const_input(other) .allow_cpu_scalars(true) .promote_inputs_to_common_dtype(true) .build(); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "equal_cpu", [&] { + AT_DISPATCH_V2(iter.input_dtype(), "equal_cpu", AT_WRAP([&] { iter.for_each([&](char** data, const int64_t *strides, int64_t dim_size) { if (!result) { return; @@ -2240,7 +2272,7 @@ bool cpu_equal(const Tensor& self, const Tensor& other) { other_data += strides[1]; } }); - }); + }), kBool, kBFloat16, kHalf, 
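Note: the should_use_acc_buffer branch above works around a TensorIterator limitation described in issue #83149: for permuted low-precision inputs on CPU the inner reduction loop would otherwise accumulate partial sums in bfloat16/half, so sum_out now reduces into a temporary float tensor and copies the result back. The user-level equivalent, as a hedged sketch:

#include <ATen/ATen.h>

// Sum a reduced-precision tensor while accumulating in fp32, then cast back;
// this mirrors what the acc-buffer branch above does internally.
at::Tensor sum_low_precision(const at::Tensor& x, int64_t dim) {
  return x.to(at::kFloat).sum(dim).to(x.scalar_type());
}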
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); return result.load(); } diff --git a/aten/src/ATen/native/ReduceOpsUtils.h b/aten/src/ATen/native/ReduceOpsUtils.h index bec04f0cd935b..6989b00f6f3e6 100644 --- a/aten/src/ATen/native/ReduceOpsUtils.h +++ b/aten/src/ATen/native/ReduceOpsUtils.h @@ -368,7 +368,13 @@ static void resize_reduction( DimVector dims_ = at::native::make_dim_vector(opt_dims, self.dim()); maybe_wrap_dims(dims_, self.dim()); auto shape = get_reduction_shape(self, dims_, keepdim, allow_empty_dims); - meta.set_output_raw_strided(0, shape, {}, self.options().dtype(out_dtype)); + if (self.layout() == kStrided) { + meta.set_output_raw_strided(0, shape, {}, self.options().dtype(out_dtype)); + } else if (shape.size() == 0) { + meta.set_output_raw_strided(0, shape, {}, self.options().dtype(out_dtype).layout(kStrided)); + } else { + TORCH_CHECK(false, "resize_reduction: support for output with ", self.layout(), " layout is not implemented yet"); + } namedinference::propagate_names_for_reduction( meta.maybe_get_output(), self, dims_, keepdim); } diff --git a/aten/src/ATen/native/Repeat.cpp b/aten/src/ATen/native/Repeat.cpp index 81d6be7b85200..dd87cead1f480 100644 --- a/aten/src/ATen/native/Repeat.cpp +++ b/aten/src/ATen/native/Repeat.cpp @@ -16,8 +16,8 @@ template static void compute_cpu( - index_t* repeat_ptr, - int64_t* cumsum_ptr, + const index_t* repeat_ptr, + const int64_t* cumsum_ptr, index_t* result_ptr, int64_t size, int64_t result_size) { diff --git a/aten/src/ATen/native/Repeat.h b/aten/src/ATen/native/Repeat.h index a90ed815f9352..e9a471d16f931 100644 --- a/aten/src/ATen/native/Repeat.h +++ b/aten/src/ATen/native/Repeat.h @@ -14,7 +14,7 @@ namespace at::native { template < typename index_t, - void compute(index_t*, int64_t*, index_t*, int64_t, int64_t)> + void compute(const index_t*, const int64_t*, index_t*, int64_t, int64_t)> static inline Tensor repeat_interleave_common( const Tensor& repeats, c10::optional output_size) { @@ -38,8 +38,8 @@ static inline Tensor repeat_interleave_common( } Tensor result = at::empty({total}, repeats.options()); - index_t* repeat_ptr = repeats_.data_ptr(); - int64_t* cumsum_ptr = cumsum.data_ptr(); + const index_t* repeat_ptr = repeats_.const_data_ptr(); + const int64_t* cumsum_ptr = cumsum.const_data_ptr(); index_t* result_ptr = result.data_ptr(); compute(repeat_ptr, cumsum_ptr, result_ptr, repeats.size(0), total); return result; diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp index 13d784b1fbb20..d0762d28459f6 100644 --- a/aten/src/ATen/native/ReplicationPadding.cpp +++ b/aten/src/ATen/native/ReplicationPadding.cpp @@ -25,6 +25,7 @@ namespace at::meta { TORCH_META_FUNC(replication_pad1d) ( const Tensor& input, IntArrayRef paddingSize // no out argument! 
) { + TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); int64_t dimw = 1; int64_t dimslices = 0; @@ -85,6 +86,7 @@ TORCH_META_FUNC(replication_pad1d_backward) ( TORCH_META_FUNC(replication_pad2d) ( const Tensor& input, IntArrayRef paddingSize ) { + TORCH_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); int64_t pad_l = paddingSize[0]; int64_t pad_r = paddingSize[1]; int64_t pad_t = paddingSize[2]; @@ -124,6 +126,7 @@ TORCH_META_FUNC(replication_pad2d) ( TORCH_META_FUNC(replication_pad3d) ( const Tensor& input, IntArrayRef paddingSize ) { + TORCH_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); int64_t pleft = paddingSize[0]; int64_t pright = paddingSize[1]; int64_t ptop = paddingSize[2]; diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index 415d3d65bef42..be88538ed7082 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -94,13 +94,14 @@ void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) { if (size_bytes != 0) { new_data = storage->allocator()->allocate(size_bytes); } - at::DataPtr old_data = storage->set_data_ptr(std::move(new_data)); + const at::DataPtr& old_data = storage->data_ptr(); const auto old_capacity = storage->nbytes(); - storage->set_nbytes(size_bytes); const auto copy_capacity = std::min(size_bytes, old_capacity); if (old_data != nullptr && copy_capacity > 0) { - memcpy(storage->mutable_data(), old_data.get(), copy_capacity); + memcpy(new_data.get(), old_data.get(), copy_capacity); } + storage->set_data_ptr_noswap(std::move(new_data)); + storage->set_nbytes(size_bytes); } // Call the sparse implementation in SparseTensor.cpp directly. @@ -281,4 +282,50 @@ const Tensor& resize__symint( return _resize_(self, size, optional_memory_format); } +void resize_bytes_nocuda(const Storage& storage, c10::SymInt newsize) { + // handles all devices except cuda (which needs to be in a different .so) + c10::DeviceType device_type = storage.device_type(); + if (device_type == at::kCPU) { + at::native::resize_bytes_cpu(storage.unsafeGetStorageImpl(), newsize.expect_int()); + } else if (device_type == at::kMeta) { + at::native::resize_bytes_meta(storage.unsafeGetStorageImpl(), newsize); + } else if (device_type == at::kPrivateUse1) { + at::GetPrivateUse1HooksInterface()->resizePrivateUse1Bytes( + storage, newsize.expect_int()); + } else if (device_type == at::kXPU || device_type == at::kHPU) { + ptrdiff_t size_bytes_i = newsize.expect_int(); + TORCH_CHECK( + !c10::overflows(size_bytes_i), + "Requested storage size (", + size_bytes_i, + ") cannot be represented as a int64_t"); + const auto size_bytes = static_cast(size_bytes_i); + void* original_data_ptr = storage.data_ptr().get(); + + auto src_option = + c10::TensorOptions().device(storage.device()).dtype(at::kByte); + auto src_tensor = at::empty({0}, src_option).set_(storage); + src_tensor.resize_({size_bytes}); + + // When using resize_ to replace resize_bytes_xxx, in some cases + // the original data_ptr is still returned, which is an inconsistent + // behavior when compared to resize_bytes_xxx. For these cases, + // an additional memory copy and update for storage are required. 
+ if (original_data_ptr == src_tensor.storage().data_ptr().get()) { + auto new_tensor = at::empty(src_tensor.sizes(), src_tensor.options()); + new_tensor.copy_(src_tensor); + storage.set_data_ptr_noswap( + std::move(new_tensor.storage().mutable_data_ptr())); + storage.unsafeGetStorageImpl()->set_allocator( + new_tensor.storage().unsafeGetStorageImpl()->allocator()); + storage.set_nbytes(new_tensor.storage().nbytes()); + } + } else { + TORCH_CHECK( + false, + "UntypedStorage.resize_: got unexpected device type ", + device_type); + } +} + } // namespace at::native diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h index b752b91e04f38..0a1f21298957d 100644 --- a/aten/src/ATen/native/Resize.h +++ b/aten/src/ATen/native/Resize.h @@ -38,6 +38,7 @@ TORCH_API bool resize_output_check_symint(const Tensor& output, SymIntArrayRef s TORCH_API void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes); TORCH_API void resize_bytes_meta(StorageImpl* storage, c10::SymInt size_bytes); +TORCH_API void resize_bytes_nocuda(const Storage& storage, c10::SymInt size_bytes); static inline void maybe_resize_storage_cpu(TensorImpl* self, size_t new_size_bytes) { // It does not make sense to try to resize a storage diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index 1d7adc5f569f8..ec19449d4133e 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -40,7 +40,7 @@ Scalar _local_scalar_dense_cpu(const Tensor& self) { self.scalar_type(), "_local_scalar_dense_cpu", AT_WRAP([&] { - scalar_t value = *self.data_ptr(); + scalar_t value = *self.const_data_ptr(); r = Scalar(value); }), AT_EXPAND(AT_SD_TYPES) diff --git a/aten/src/ATen/native/SegmentReduce.cpp b/aten/src/ATen/native/SegmentReduce.cpp index 61d2a1f60ca11..3c7b539ee4b6d 100644 --- a/aten/src/ATen/native/SegmentReduce.cpp +++ b/aten/src/ATen/native/SegmentReduce.cpp @@ -52,7 +52,7 @@ void _segment_reduce_lengths_cpu_kernel1( AT_DISPATCH_FLOATING_TYPES_AND2( kBFloat16, kHalf, data.scalar_type(), "_segment_reduce_cpu", [&]() { auto* output_data = output.data_ptr(); - const auto* values_data = data.data_ptr(); + const auto* values_data = data.const_data_ptr(); for (const auto outer_idx : c10::irange(outer_offset)) { int64_t segment_start, segment_length; int64_t segment_end = is_offsets_like ? 
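Note: the reworked resize_bytes_cpu above assembles the new buffer completely (allocation plus a copy of the overlapping bytes) before installing it with set_data_ptr_noswap and updating nbytes, instead of swapping the pointer first and copying through mutable_data() afterwards. The same copy-then-publish ordering in a generic, standalone form:

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <cstring>

struct Buf {
  void* data = nullptr;
  size_t nbytes = 0;
};

void resize_buf(Buf& b, size_t new_bytes) {
  void* fresh = new_bytes != 0 ? std::malloc(new_bytes) : nullptr;
  const size_t copy_bytes = std::min(new_bytes, b.nbytes);
  if (b.data != nullptr && copy_bytes > 0) {
    std::memcpy(fresh, b.data, copy_bytes);   // copy into the new buffer first
  }
  std::free(b.data);
  b.data = fresh;        // ...then publish the pointer
  b.nbytes = new_bytes;  // ...and finally the new size
}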
@@ -145,7 +145,7 @@ Tensor _segment_reduce_lengths_cpu_kernel( auto output = at::empty(output_shape, data.options()); AT_DISPATCH_INDEX_TYPES(lengths.scalar_type(), "_segment_reduce_lengths_cpu_kernel1", [&]() { - const auto* lengths_data = lengths.data_ptr(); + const auto* lengths_data = lengths.const_data_ptr(); _segment_reduce_lengths_cpu_kernel1( reduction, data, lengths_data, axis, initial, output, segment_count, lengths_stride_axis); }); @@ -171,7 +171,7 @@ Tensor _segment_reduce_offsets_cpu_kernel( auto output = at::empty(output_shape, data.options()); AT_DISPATCH_INDEX_TYPES(offsets.scalar_type(), "_segment_reduce_offsets_cpu_kernel1", [&]() { - const auto* offsets_data = offsets.data_ptr(); + const auto* offsets_data = offsets.const_data_ptr(); _segment_reduce_lengths_cpu_kernel1( reduction, data, offsets_data, axis, initial, output, segment_count, offsets_stride_axis); }); @@ -211,10 +211,10 @@ void _segment_reduce_cpu_lengths_backward_kernel1( data_contig.scalar_type(), "_segment_reduce_cpu", [&]() { - auto* output_data = output_contig.data_ptr(); - auto* grad_data = grad_contig.data_ptr(); + auto* output_data = output_contig.const_data_ptr(); + auto* grad_data = grad_contig.const_data_ptr(); auto* grad_input_data = grad_input.mutable_data_ptr(); - const auto* values_data = data_contig.data_ptr(); + const auto* values_data = data_contig.const_data_ptr(); // Used to calculate exclusive prod scalar_t initial_prod_value; if (reduction == ReductionType::PROD) { @@ -331,7 +331,7 @@ Tensor _segment_reduce_cpu_lengths_backward_kernel( AT_DISPATCH_INDEX_TYPES( lengths_contig.scalar_type(), "_segment_reduce_cpu_lengths_backward_kernel1", [&] { - const auto* lengths_data = lengths_contig.data_ptr(); + const auto* lengths_data = lengths_contig.const_data_ptr(); _segment_reduce_cpu_lengths_backward_kernel1( grad_contig, output_contig, @@ -364,7 +364,7 @@ Tensor _segment_reduce_cpu_offsets_backward_kernel( AT_DISPATCH_INDEX_TYPES( offsets_contig.scalar_type(), "_segment_reduce_cpu_offsets_backward_kernel1", [&] { - const auto* offsets_data = offsets_contig.data_ptr(); + const auto* offsets_data = offsets_contig.const_data_ptr(); _segment_reduce_cpu_lengths_backward_kernel1( grad_contig, output_contig, diff --git a/aten/src/ATen/native/SobolEngineOpsUtils.cpp b/aten/src/ATen/native/SobolEngineOpsUtils.cpp index 1e129673accdd..3d492221c5057 100644 --- a/aten/src/ATen/native/SobolEngineOpsUtils.cpp +++ b/aten/src/ATen/native/SobolEngineOpsUtils.cpp @@ -3,7 +3,7 @@ #include /* -The direction nubmers in this file were generated using the +The direction numbers in this file were generated using the python script below (thius this assumes that the file https://web.maths.unsw.edu.au/~fkuo/sobol/new-joe-kuo-6.21201 is present in the working directory). For additional details see [1]. diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index 30bb6da1d8642..bd321a0a88e7a 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -637,8 +637,8 @@ Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const c10:: input = input.view(1); } - AT_DISPATCH_FLOATING_TYPES_AND( - at::ScalarType::BFloat16, input.scalar_type(), "masked_softmax", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::BFloat16, at::ScalarType::Half, input.scalar_type(), "masked_softmax", [&] { host_softmax< scalar_t, false /* LogSoftMax */, @@ -670,8 +670,8 @@ Tensor masked_softmax_backward_cpu( mask = mask.dim() == 0 ? 
mask.view(1) : mask; Tensor grad_input = at::empty_like(grad, grad.options()); - AT_DISPATCH_FLOATING_TYPES_AND( - at::ScalarType::BFloat16, grad.scalar_type(), "masked_softmax_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::BFloat16, at::ScalarType::Half, grad.scalar_type(), "masked_softmax_backward", [&] { host_softmax_backward< scalar_t, false /* LogSoftMax */, diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index 91f05e367fed2..b31007408c7ae 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -72,9 +72,6 @@ TORCH_META_FUNC(topk) TORCH_META_FUNC2(sort, stable) (const Tensor& self, c10::optional stable, int64_t dim, bool descending) { - TORCH_INTERNAL_ASSERT( - stable.has_value(), - "sort(): c10::optional for stable has to have value."); maybe_wrap_dim(dim, self.dim()); // See issue: https://github.com/pytorch/pytorch/issues/65863 @@ -549,7 +546,7 @@ std::tuple median_with_indices_impl( .declare_static_shape(sizes, /*squash_dims=*/dim) .add_output(vals) .add_output(inds) - .add_input(in) + .add_const_input(in) .build(); AT_DISPATCH_ALL_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, in.scalar_type(), "median_out", [&] { @@ -953,7 +950,7 @@ TORCH_IMPL_FUNC(sort_stable_out) indices.zero_(); } else { dim = maybe_wrap_dim(dim, self.dim()); - sort_stub(self.device().type(), self, values, indices, dim, descending, stable.value()); + sort_stub(self.device().type(), self, values, indices, dim, descending, stable.value_or(false)); } } diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 26422ed8130a7..7ed068874e68a 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -83,8 +83,7 @@ ScalarType promote_type_fft(ScalarType type, bool require_complex, Device device const bool maybe_support_half = ( // Only CUDA supports half precision, but since meta tensors don't have a // device we err on the side of accepting it - (device.is_cuda() || device.is_meta()) && - !at::detail::getCUDAHooks().hasROCM() + device.is_cuda() || device.is_meta() ); if (maybe_support_half) { TORCH_CHECK(type == kHalf || type == kFloat || type == kDouble, "Unsupported dtype ", type); diff --git a/aten/src/ATen/native/SpectralOpsUtils.h b/aten/src/ATen/native/SpectralOpsUtils.h index 7d9852b8e7b0b..279e4ff595567 100644 --- a/aten/src/ATen/native/SpectralOpsUtils.h +++ b/aten/src/ATen/native/SpectralOpsUtils.h @@ -3,7 +3,11 @@ #include #include #include +#include +#include +#include #include +#include namespace at::native { diff --git a/aten/src/ATen/native/SummaryOps.cpp b/aten/src/ATen/native/SummaryOps.cpp index 81a0ccd6d8337..4c158f81a47e9 100644 --- a/aten/src/ATen/native/SummaryOps.cpp +++ b/aten/src/ATen/native/SummaryOps.cpp @@ -43,7 +43,7 @@ Tensor _bincount_cpu_template( int64_t nbins = static_cast(*self.max().data_ptr()) + 1L; nbins = std::max(nbins, minlength); // at least minlength # of bins - const input_t* self_p = self.data_ptr(); + const input_t* self_p = self.const_data_ptr(); if (has_weights) { output = at::zeros( {nbins}, @@ -52,7 +52,7 @@ Tensor _bincount_cpu_template( weights.options().device_opt(), weights.options().pinned_memory_opt()); weights_t* output_p = output.data_ptr(); - const weights_t* weights_p = weights.data_ptr(); + const weights_t* weights_p = weights.const_data_ptr(); for (const auto i : c10::irange(self_size)) { output_p[self_p[i]] += weights_p[i]; } diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp 
b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index ba00cbf344569..f1e385d8eeac8 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -408,9 +408,9 @@ static void build_index_op( config.set_check_mem_overlap(false) .check_all_same_dtype(false) .add_output(result) - .add_owned_input(info.src); + .add_owned_const_input(info.src); for (auto& index : info.indices) { - config.add_owned_input(index); + config.add_owned_const_input(index); } if (!result.defined()) { config.declare_static_dtype_and_device(info.src.scalar_type(), info.src.device()); @@ -614,9 +614,9 @@ static TensorIterator make_index_put_iterator(const AdvancedIndex& info, const T config.resize_outputs(false); config.check_all_same_dtype(false); config.add_output(info.src); - config.add_input(value); + config.add_const_input(value); for (auto& index : info.indices) { - config.add_input(index); + config.add_const_input(index); } return config.build(); } @@ -689,8 +689,8 @@ Tensor & put_(Tensor & self, const Tensor& index, const Tensor & source, const b auto iter = TensorIteratorConfig() .set_check_mem_overlap(false) .check_all_same_dtype(false) - .add_input(source) - .add_input(index_reshaped) + .add_const_input(source) + .add_const_input(index_reshaped) .build(); put_stub(iter.device_type(), iter, self, accumulate); @@ -769,7 +769,7 @@ Tensor& take_out(const Tensor& self, const Tensor& index, Tensor& out) { .set_check_mem_overlap(false) .check_all_same_dtype(false) .add_output(out) - .add_input(index) + .add_const_input(index) .build(); // Early return after out has been resized @@ -848,8 +848,8 @@ TORCH_IMPL_FUNC(index_copy_out) .check_all_same_dtype(false) .resize_outputs(false) .add_output(result_restrided) - .add_input(index_restrided) - .add_input(source_nonzero) + .add_const_input(index_restrided) + .add_const_input(source_nonzero) .build(); auto result_dim_size = result_nonzero.size(dim); @@ -943,15 +943,15 @@ TORCH_IMPL_FUNC(index_add_cpu_out) auto iter = TensorIterator::borrowing_binary_op(selfSlice, selfSlice, sourceSlice); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_cpu_", [&] () { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_dim_size), "index out of range in self"); auto self_data = static_cast(selfSlice.data_ptr()) + self_i * self_stride_bytes; - auto source_data = static_cast(sourceSlice.data_ptr()) + i * source_stride_bytes; + auto source_data = static_cast(sourceSlice.const_data_ptr()) + i * source_stride_bytes; iter.unsafe_replace_operand(0, self_data); iter.unsafe_replace_operand(1, self_data); - iter.unsafe_replace_operand(2, source_data); + iter.unsafe_replace_operand(2, const_cast(source_data)); add_stub(iter.device_type(), iter, alpha); } }); @@ -967,10 +967,10 @@ TORCH_IMPL_FUNC(index_add_cpu_out) auto source_stride = source.dim() == 0 ? 1 : source.stride(dim); // TODO: Maybe TensorAccessor can be used here? 
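+    // In this path only `result` is written; `source` and `index` are read-only, hence the const_data_ptr accesses below.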
auto* result_ptr = result.data_ptr(); - auto* source_ptr = source.data_ptr(); + auto* source_ptr = source.const_data_ptr(); AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_add_cpu_", [&index_contig, &numel, &result, &result_ptr, &result_stride, &source_ptr, &source_stride, &alpha_value] { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < result.numel()), "index out of range in self"); @@ -1040,15 +1040,15 @@ static void index_reduce_func_impl( auto iter = TensorIterator::borrowing_binary_op(selfSlice, selfSlice, sourceSlice); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_func_cpu_", [&] () { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_dim_size), "index out of range in self"); auto self_data = static_cast(selfSlice.data_ptr()) + self_i * self_stride_bytes; - auto source_data = static_cast(sourceSlice.data_ptr()) + i * source_stride_bytes; + auto source_data = static_cast(sourceSlice.const_data_ptr()) + i * source_stride_bytes; iter.unsafe_replace_operand(0, self_data); iter.unsafe_replace_operand(1, self_data); - iter.unsafe_replace_operand(2, source_data); + iter.unsafe_replace_operand(2, const_cast(source_data)); switch (op) { case ReductionType::PROD : @@ -1090,11 +1090,11 @@ static void index_reduce_func_impl( auto counts_stride = counts.dim() == 0 ? 1 : counts.stride(dim); // TODO: Maybe TensorAccessor can be used here? auto* result_ptr = result.data_ptr(); - auto* source_ptr = source.data_ptr(); + auto* source_ptr = source.const_data_ptr(); auto counts_ptr = counts.data_ptr(); AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_func_cpu_", [&index_contig, &numel, &result, &result_ptr, &result_stride, &source_ptr, &source_stride, &op, &counts_ptr, &counts_stride] { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < result.numel()), "index out of range in self"); @@ -1175,7 +1175,7 @@ static Tensor & index_select_out_cpu_dim1_( auto out = static_cast(result_contig.data_ptr()); - auto src_base = static_cast(self_contig.data_ptr()); + auto src_base = static_cast(self_contig.const_data_ptr()); auto self_sizes = self_contig.sizes(); auto outer_dims_product = c10::size_to_dim_(1, self_sizes); @@ -1191,7 +1191,7 @@ static Tensor & index_select_out_cpu_dim1_( AT_DISPATCH_INDEX_TYPES( index_contig.scalar_type(), "batch_index_select_compute", [&]() { - const auto* idxs = index_contig.data_ptr(); + const auto* idxs = index_contig.const_data_ptr(); check_indexarray_range(idxs, N, src_indexing_axis_dim); // Special-case single-float copy for efficiency @@ -1256,7 +1256,7 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & "index_select(): self indexing axis dim should be positive"); AT_DISPATCH_INDEX_TYPES( index_contig.scalar_type(), "index_select_empty_self_bound_check", [&]() { - const auto* idxs = index_contig.data_ptr(); + const auto* idxs = index_contig.const_data_ptr(); check_indexarray_range(idxs, numel, src_indexing_axis_dim); }); return result; @@ -1269,7 +1269,7 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & 
auto selfSlice = self.select(dim, 0); auto resultSlice = result.select(dim, 0); - auto selfSlice_data = selfSlice.data_ptr(); + auto selfSlice_data = selfSlice.const_data_ptr(); auto resultSlice_data = resultSlice.data_ptr(); auto self_stride_bytes = self.stride(dim) * elementSize(self.scalar_type()); auto result_stride_bytes = result.stride(dim) * elementSize(result.scalar_type()); @@ -1280,7 +1280,7 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & .check_all_same_dtype(false) .resize_outputs(false) .add_output(resultSlice) - .add_input(selfSlice) + .add_const_input(selfSlice) .build(); auto grain_size = at::internal::GRAIN_SIZE; @@ -1293,14 +1293,14 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_select_out_cpu_", [&index_contig, &start, &end, &sub_iter, &self_dim_size, &selfSlice_data, &self_stride_bytes, &resultSlice_data, &result_stride_bytes] () { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(start, end)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_dim_size), "index out of range in self"); - auto self_data = static_cast(selfSlice_data) + self_i * self_stride_bytes; + auto self_data = static_cast(selfSlice_data) + self_i * self_stride_bytes; auto result_data = static_cast(resultSlice_data) + i * result_stride_bytes; sub_iter.unsafe_replace_operand(0, result_data); - sub_iter.unsafe_replace_operand(1, self_data); + sub_iter.unsafe_replace_operand(1, const_cast(self_data)); copy_stub(sub_iter.device_type(), sub_iter, false); }; }); @@ -1322,11 +1322,11 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_select_out_cpu_", [&index_contig, &slice_size_bytes, &self_dim_size, &selfSlice_data, &self_stride_bytes, &resultSlice_data, &result_stride_bytes, &start, &end] () { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(start, end)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_dim_size), "index out of range in self"); - auto self_data = static_cast(selfSlice_data) + self_i * self_stride_bytes; + auto self_data = static_cast(selfSlice_data) + self_i * self_stride_bytes; auto result_data = static_cast(resultSlice_data) + i * result_stride_bytes; memcpy(result_data, self_data, slice_size_bytes); } @@ -1344,16 +1344,16 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & AT_DISPATCH_QINT_TYPES(self.scalar_type(), "index_select_quant", [&index_contig, &self, &result, &dim, &numel] { auto self_stride = self.dim() == 0 ? 1 : self.stride(dim); auto result_stride = result.dim() == 0 ? 
1 : result.stride(dim); - auto self_data_ptr = self.data_ptr(); + auto self_data_ptr = self.const_data_ptr(); auto result_data_ptr = result.data_ptr(); auto self_numel = self.numel(); AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_select_out_cpu_quant_", [&index_contig, &numel, &self_numel, &self_data_ptr, &self_stride, &result_data_ptr, &result_stride] { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_numel), "index out of range in self"); - scalar_t *self_ip = self_data_ptr + self_i * self_stride; + const scalar_t *self_ip = self_data_ptr + self_i * self_stride; *(result_data_ptr + i * result_stride) = *self_ip; } }); @@ -1364,16 +1364,16 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & auto self_stride = self.dim() == 0 ? 1 : self.stride(dim); auto result_stride = result.dim() == 0 ? 1 : result.stride(dim); - auto self_data_ptr = self.data_ptr(); + auto self_data_ptr = self.const_data_ptr(); auto result_data_ptr = result.data_ptr(); auto self_numel = self.numel(); AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_select_out_cpu_", [&index_contig, &numel, &self_numel, &self_data_ptr, &self_stride, &result_data_ptr, &result_stride] { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_numel), "index out of range in self"); - scalar_t *self_ip = self_data_ptr + self_i * self_stride; + const scalar_t *self_ip = self_data_ptr + self_i * self_stride; *(result_data_ptr + i * result_stride) = *self_ip; } }); @@ -1462,7 +1462,7 @@ Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Sca .check_all_same_dtype(false) .resize_outputs(false) .add_output(self_restrided) - .add_input(index_restrided) + .add_const_input(index_restrided) .build(); auto self_dim_size = (self_nonzero_dim.sizes())[dim]; @@ -1880,8 +1880,7 @@ TORCH_IMPL_FUNC(scatter_reduce_two) } Tensor masked_scatter(const Tensor & self, const Tensor & mask, const Tensor & source) { - c10::MaybeOwned _mask, _self; - std::tie(_mask, _self) = expand_outplace(mask, self); + auto [_mask, _self] = expand_outplace(mask, self); return _self->clone(at::MemoryFormat::Contiguous).masked_scatter_(*_mask, source); } @@ -1924,7 +1923,7 @@ static Tensor & masked_fill_impl_cpu(Tensor & self, const Tensor & mask, const S .check_all_same_dtype(false) .resize_outputs(false) .add_output(self) - .add_input(mask) + .add_const_input(mask) .build(); masked_fill_stub(iter.device_type(), iter, value); @@ -1954,8 +1953,7 @@ Tensor masked_fill(const Tensor & self, const Tensor & mask, const Scalar& sourc auto maybe_outnames = namedinference::broadcast_to_outnames(mask, self, "masked_fill"); { NoNamesGuard guard; - c10::MaybeOwned _mask, _self; - std::tie(_mask, _self) = expand_outplace(mask, self); + auto [_mask, _self] = expand_outplace(mask, self); result = _self->clone(at::MemoryFormat::Contiguous); result.masked_fill_(mask, source); } @@ -1968,8 +1966,7 @@ Tensor masked_fill(const Tensor & self, const Tensor & mask, const Tensor & sour auto maybe_outnames = namedinference::broadcast_to_outnames(mask, self, "masked_fill"); { NoNamesGuard guard; - c10::MaybeOwned _mask, _self; - std::tie(_mask, _self) = expand_outplace(mask, self); + auto [_mask, _self] = 
expand_outplace(mask, self); result = _self->clone(at::MemoryFormat::Contiguous); result.masked_fill_(mask, source); } @@ -1989,8 +1986,7 @@ static Tensor & masked_select_out_impl_cpu(Tensor & result, const Tensor & self, at::assert_no_overlap(result, self); at::assert_no_overlap(result, mask); - c10::MaybeOwned _mask, _self; - std::tie(_mask, _self) = expand_outplace(mask, self); + auto [_mask, _self] = expand_outplace(mask, self); auto shape = _self->sizes(); int64_t numel = _mask->sum().item().toLong(); @@ -2017,8 +2013,8 @@ static Tensor & masked_select_out_impl_cpu(Tensor & result, const Tensor & self, .check_all_same_dtype(false) .resize_outputs(false) .add_output(result_strided) - .add_input(*_self) - .add_input(*_mask) + .add_const_input(*_self) + .add_const_input(*_mask) .build(); masked_select_serial_stub(iter.device_type(), iter, orig_stride); @@ -2041,9 +2037,9 @@ static Tensor & masked_select_out_impl_cpu(Tensor & result, const Tensor & self, .check_all_same_dtype(false) .resize_outputs(false) .add_output(result_strided) - .add_input(*_self) - .add_input(*_mask) - .add_input(mask_prefix_sum) + .add_const_input(*_self) + .add_const_input(*_mask) + .add_const_input(mask_prefix_sum) .build(); masked_select_stub(iter.device_type(), iter, orig_stride); @@ -2130,10 +2126,7 @@ static inline void checkDevice(CheckedFrom c, at::ArrayRef tensors, Devi Tensor take_along_dim(const Tensor& self, const Tensor& indices, c10::optional opt_dim) { checkDevice("torch.take_along_dim():", {self, indices}, self.device()); if (opt_dim.has_value()) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t dim; - Tensor self_broadcasted, indices_broadcasted; - std::tie(self_broadcasted, indices_broadcasted, dim) = + auto [self_broadcasted, indices_broadcasted, dim] = _take_along_dim_helper(self, indices, opt_dim.value()); return self_broadcasted.gather(dim, indices_broadcasted); } @@ -2145,10 +2138,7 @@ Tensor take_along_dim(const Tensor& self, const Tensor& indices, c10::optional opt_dim, Tensor& result) { checkDevice("torch.take_along_dim():", {self, indices, result}, self.device()); if (opt_dim.has_value()) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t dim; - Tensor self_broadcasted, indices_broadcasted; - std::tie(self_broadcasted, indices_broadcasted, dim) = + auto [self_broadcasted, indices_broadcasted, dim] = _take_along_dim_helper(self, indices, opt_dim.value()); return at::gather_out(result, self_broadcasted, dim, indices_broadcasted); } @@ -2228,7 +2218,7 @@ Tensor count_nonzero_cpu(const Tensor& self, IntArrayRef dims){ // Optimized all-reduce auto iter = TensorIteratorConfig() - .add_input(self) + .add_const_input(self) .build(); const auto num_threads = at::get_num_threads(); @@ -2267,7 +2257,7 @@ Tensor& nonzero_out_cpu(const Tensor& self, Tensor& result) { at::assert_no_overlap(result, self); auto iter = TensorIteratorConfig() - .add_input(self) + .add_const_input(self) .enforce_linear_iteration() .build(); @@ -2303,6 +2293,8 @@ Tensor& nonzero_out_cpu(const Tensor& self, Tensor& result) { return result; } + auto out_accessor = result.accessor(); + // Pass 2: Write indexes AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( kComplexHalf, kHalf, kBFloat16, kBool, self.scalar_type(), "nonzero_cpu", [&] { @@ -2323,7 +2315,6 @@ Tensor& nonzero_out_cpu(const Tensor& self, Tensor& result) { } } - auto out_accessor = result.accessor(); auto out_ptr = out_accessor[thread_count_nonzero[tid]].data(); auto loop = [&](char** data, const int64_t* strides, int64_t n1, int64_t n2) { @@ 
-2495,7 +2486,7 @@ Tensor & masked_scatter__cpu(Tensor& self, const Tensor & mask, const Tensor & s // order of indexing matters .enforce_linear_iteration() .add_output(self) - .add_input(*b_mask) + .add_const_input(*b_mask) .build(); masked_scatter_stub(iter.device_type(), iter, src_cont); diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index f7a2d0f766858..04d8e8cbf8313 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #ifndef AT_PER_OPERATOR_HEADERS @@ -22,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -71,6 +73,7 @@ #include #include +#include #include #endif @@ -128,17 +131,17 @@ const OptionalTensorRef max) { TensorIteratorConfig() \ .set_check_mem_overlap(true) \ .add_output(maybe_get_output()) \ - .add_input(self) \ + .add_const_input(self) \ .promote_inputs_to_common_dtype(true) \ .cast_common_dtype_to_outputs(true) \ .enforce_safe_casting_to_output(true) if (min && max) { - build(CLAMP_CONFIG().add_input(*min).add_input(*max)); + build(CLAMP_CONFIG().add_const_input(*min).add_const_input(*max)); } else if (min) { - build(CLAMP_CONFIG().add_input(*min)); + build(CLAMP_CONFIG().add_const_input(*min)); } else if (max) { - build(CLAMP_CONFIG().add_input(*max)); + build(CLAMP_CONFIG().add_const_input(*max)); } } @@ -440,6 +443,9 @@ Tensor _functional_assert_async_msg_cpu( return dep_token.clone(); } +void _print(c10::string_view s) { + std::cout << s << "\n"; +} // Sorting-based algorithm for isin(); used when the number of test elements is large. static void isin_sorting( @@ -455,17 +461,16 @@ static void isin_sorting( elements_flat = elements.ravel(); test_elements_flat = test_elements.ravel(); } else { - std::tie (elements_flat, unique_order) = at::_unique( + std::tie(elements_flat, unique_order) = at::_unique( elements, /*sorted=*/ false, /*return_inverse=*/ true); - std::tie (test_elements_flat, std::ignore) = at::_unique(test_elements, /*sorted=*/ false); + std::tie(test_elements_flat, std::ignore) = at::_unique(test_elements, /*sorted=*/ false); } // 2. Stable sort all elements, maintaining order indices to reverse the // operation. Stable sort is necessary to keep elements before test // elements within the sorted list. Tensor all_elements = at::cat({std::move(elements_flat), std::move(test_elements_flat)}); - Tensor sorted_elements, sorted_order; - std::tie (sorted_elements, sorted_order) = all_elements.sort( + auto [sorted_elements, sorted_order] = all_elements.sort( /*stable=*/ true, /*dim=*/ 0, /*descending=*/ false); // 3. Create a mask for locations of adjacent duplicate values within the @@ -503,17 +508,13 @@ Device out_device(Args&... inps){ Tensor& where_self_out(const Tensor& condition, const Tensor& self, const Tensor& other, Tensor& out) { - Tensor self_, other_, condition_; - if (self.dtype() != other.dtype()) { - auto result_type = at::native::result_type(self, other); - self_ = self.to(result_type); - other_ = other.to(result_type); - } else { - self_ = self; - other_ = other; - } + const auto result_type = at::native::result_type(self, other); + TORCH_CHECK(out.scalar_type() == result_type, "Expected out type to be ", result_type, " but got ", out.scalar_type()); + + auto self_ = self.scalar_type() != result_type ? self.to(result_type): self; + auto other_ = other.scalar_type() != result_type ? 
other.to(result_type): other; + auto condition_ = condition; auto device = out_device(condition, self_, other_); - condition_ = condition; if (device != at::kCPU) { // allow CPU scalars on non-cpu device if (condition.device() != device && condition.ndimension() == 0) { condition_ = condition.to(device); @@ -525,19 +526,18 @@ Tensor& where_self_out(const Tensor& condition, const Tensor& self, const Tensor other_ = other_.to(device); } } - if (condition.scalar_type() == ScalarType::Byte) { - TORCH_WARN_ONCE("where received a uint8 condition tensor. This behavior is deprecated and will be removed in a future version of PyTorch. Use a boolean condition instead."); - } else { - TORCH_CHECK(condition.scalar_type() == ScalarType::Bool, "where expected condition to be a boolean tensor, but got a tensor with dtype ", condition.scalar_type()); + if (condition_.scalar_type() == ScalarType::Byte) { + TORCH_WARN_ONCE("where received a uint8 condition tensor. This behavior is deprecated and will be removed in a future version of PyTorch. Use a boolean condition instead."); + condition_ = condition_.to(kBool); } - condition_ = condition_.scalar_type() == ScalarType::Byte ? condition_.to(ScalarType::Bool) : condition_; + TORCH_CHECK(condition_.scalar_type() == kBool, "where expected condition to be a boolean tensor, but got a tensor with dtype ", condition_.scalar_type()); // if there's still a device mismatch, let tensoriterator error out with it auto iter = at::TensorIteratorConfig() .check_all_same_dtype(false) .add_output(out) - .add_input(condition_) - .add_input(self_) - .add_input(other_) + .add_const_input(condition_) + .add_const_input(self_) + .add_const_input(other_) .build(); where_kernel(iter.device_type(), iter); return out; diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index ac3f6d4763ac8..c70da8334a5e9 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -254,15 +254,51 @@ Tensor _to_copy( // TODO: Use the dispatcher for this. // Currently there are unenumerated extensibility issues preventing this. 
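+  // Sparse COO: indices and values are converted separately below and the tensor is rebuilt with _sparse_coo_tensor_unsafe, preserving the coalesced flag; meta targets short-circuit to zeros_like.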
- if (at::sparse_csr::is_sparse_compressed(self)) { + if (self.layout() == kSparse) { + TORCH_CHECK( + memory_format == MemoryFormat::Preserve, + "to(options): COO only supports memory format Preserve, but got ", memory_format, + " instead."); + if (options.device().is_meta()) { + return zeros_like(self, options); + } + auto indices = self._indices(); + const auto new_indices = at::native::to( + indices, + indices.scalar_type(), + c10::kStrided, + device, + pin_memory, + non_blocking, + true, // force copy since we are in _to_copy + memory_format); + const auto new_values = at::native::to( + self._values(), + dtype, + c10::kStrided, + device, + pin_memory, + non_blocking, + true, // force copy since we are in _to_copy + memory_format); + + return at::_sparse_coo_tensor_unsafe( + new_indices, + new_values, + self.sizes(), + options, self.is_coalesced()); + } else if (at::sparse_csr::is_sparse_compressed(self)) { TORCH_CHECK( memory_format == MemoryFormat::Preserve, "to(options): ", at::sparse_csr::layoutToString(self.layout()), " only supports memory format Preserve, but got ", memory_format, " instead."); - Tensor compressed_indices, plain_indices; - std::tie(compressed_indices, plain_indices) = at::sparse_csr::getCompressedPlainIndices(self); + if (options.device().is_meta()) { + return zeros_like(self, options); + } + + auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(self); const auto new_values = at::native::to( self.values(), @@ -340,7 +376,7 @@ Tensor _to_copy( } // See Note [Explicit nullopt MemoryFormat argument] // TODO: empty_quantized does not work here. It raises an exception in CheckMemoryFormat.h prior to - // empty_affine_quantizd/_empty_per_channel_affine_quantized calls + // empty_affine_quantized/_empty_per_channel_affine_quantized calls // at::empty also does not work here because there is no proper at::empty support for quantized tensors // as it would return a quantized tensor with an UnknownQuantizer auto r = self.is_quantized() ? at::empty_like(self, memory_format) @@ -609,9 +645,7 @@ Tensor sparse_compressed_to_dense( auto compressed_rows = self.layout() == kSparseCsr || self.layout() == kSparseBsr; auto block_sparse = self.layout() == kSparseBsr || self.layout() == kSparseBsc; - Tensor compressed_indices; - Tensor plain_indices; - std::tie(compressed_indices, plain_indices) = + auto [compressed_indices, plain_indices] = sparse_csr::getCompressedPlainIndices(self); auto values = self.values(); @@ -656,7 +690,7 @@ Tensor sparse_compressed_to_dense( dense = dense.reshape(dense_reshaped_sizes); // Calculate batch, row and column indices for non-zeros in the - // sparse matrix, and use these to calculate correspoding indices + // sparse matrix, and use these to calculate corresponding indices // into the dense matrix reshaped as above. Then, update dense // matrix by adding sparse matrix values into elements with indices // calculated this way. 
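Illustration (not part of the patch): the kSparse branch added to _to_copy above converts indices and values separately and rebuilds the COO tensor, so a dtype conversion on a sparse COO tensor keeps its layout and integer indices. A minimal sketch, assuming only the public libtorch C++ API (torch::tensor, Tensor::to_sparse, Tensor::to); the exact dispatch path is not asserted beyond the hunk above.

#include <torch/torch.h>
#include <iostream>

int main() {
  auto dense = torch::tensor({{0.0, 1.0}, {2.0, 0.0}}, torch::kFloat);
  auto coo = dense.to_sparse();               // sparse COO layout, float values
  auto converted = coo.to(torch::kDouble);    // dtype change forces a copy
  // Values become double, indices remain integral, layout stays sparse COO.
  std::cout << converted.layout() << " " << converted.dtype() << std::endl;
  return 0;
}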
@@ -1482,7 +1516,7 @@ void convert_indices_from_coo_to_csr_cpu( const Tensor& input, const int64_t size) { int64_t numel = input.numel(); - const input_t* data_in = input.data_ptr(); + const input_t* data_in = input.const_data_ptr(); output_t* data_out = result.data_ptr(); if (numel == 0) { @@ -1528,7 +1562,7 @@ void convert_indices_from_csr_to_coo_cpu( batch_indices.copy_(at::sparse::full_coo_indices(crow_indices.sizes().slice(0, batch_ndim), crow_indices.options()) .repeat_interleave(nnz, 1)); } - const input_t* crow_indices_data_in = crow_indices_->data_ptr(); + const input_t* crow_indices_data_in = crow_indices_->const_data_ptr(); TORCH_INTERNAL_ASSERT(indices.is_contiguous()); auto row0 = indices.select(0, transpose ? batch_ndim + 1 : batch_ndim + 0); auto row1 = indices.select(0, transpose ? batch_ndim + 0 : batch_ndim + 1); @@ -1836,8 +1870,7 @@ Tensor sparse_compressed_to_sparse(const Tensor& self, const int64_t sparse_dim) _to_sparse_check_arguments("sparse_compressed_to_sparse", self, sparse_dim); Layout layout = self.layout(); - Tensor compressed_indices, plain_indices; - std::tie(compressed_indices, plain_indices) = at::sparse_csr::getCompressedPlainIndices(self); + auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(self); Tensor values; Tensor indices = at::_convert_indices_from_csr_to_coo(compressed_indices, plain_indices, false, (layout == kSparseCsc || layout == kSparseBsc)); diff --git a/aten/src/ATen/native/TensorDimApply.h b/aten/src/ATen/native/TensorDimApply.h index 65d90f6fda1f5..4d52446446316 100644 --- a/aten/src/ATen/native/TensorDimApply.h +++ b/aten/src/ATen/native/TensorDimApply.h @@ -10,7 +10,7 @@ void tensor_dim_apply3(const Tensor& self, Tensor& values, Tensor& indices, int6 int ndims = self.dim(); int tensor_dim_apply_has_finished = 0; std::vector counter(ndims, 0); - T1* self_data = self.data_ptr(); + const T1* self_data = self.const_data_ptr(); T1* values_data = values.data_ptr(); T2* indices_data = indices.data_ptr(); int64_t self_stride = self.stride(dim); diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 2cb92baf79f9b..c8fddc3756353 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -214,8 +215,8 @@ Tensor& complex_out(const Tensor& real, const Tensor& imag, Tensor& result) { complex_check_dtype(result, real, imag); auto iter = TensorIteratorConfig() .add_output(result) - .add_input(real) - .add_input(imag) + .add_const_input(real) + .add_const_input(imag) .check_all_same_dtype(false) .build(); complex_stub(iter.device_type(), iter); @@ -234,8 +235,8 @@ Tensor& polar_out(const Tensor& abs, const Tensor& angle, Tensor& result) { complex_check_dtype(result, abs, angle); auto iter = TensorIteratorConfig() .add_output(result) - .add_input(abs) - .add_input(angle) + .add_const_input(abs) + .add_const_input(angle) .check_all_same_dtype(false) .build(); polar_stub(iter.device_type(), iter); @@ -277,8 +278,8 @@ Tensor empty_names( } TORCH_CHECK(options.layout() == Layout::Strided, "NYI: named tensors only support strided layout"); - TORCH_CHECK(options.device().is_cpu() || options.device().is_cuda() || options.device().is_privateuseone(), - "NYI: named tensors only support CPU, CUDA or ", c10::get_privateuse1_backend(), " tensors."); + TORCH_CHECK(options.device().is_cpu() || options.device().is_cuda() || options.device().is_xpu() || 
options.device().is_privateuseone(), + "NYI: named tensors only support CPU, CUDA, XPU or ", c10::get_privateuse1_backend(), " tensors."); auto result = at::empty(size, options, optional_memory_format); internal_set_names_inplace(result, names); return result; @@ -368,10 +369,9 @@ Tensor& empty_out(IntArrayRef size, // Some scalar types in CAST_OP have no declarations, they may be unused in Pytorch. // But we keep them and ignore the warning here until verified in the future. -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wmissing-prototypes" +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-prototypes") AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DEFINE_CAST_OP) -#pragma clang diagnostic pop +C10_DIAGNOSTIC_POP() #undef DEFINE_CAST_OP @@ -1339,16 +1339,16 @@ Tensor _efficientzerotensor(IntArrayRef size, return out; } -Tensor _efficientzerotensor_meta(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { +Tensor _efficientzerotensor_meta_symint(SymIntArrayRef size, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory) { auto device_ = device_or_default(device); auto allocator = at::native::ZeroTensorAllocator(device_); auto dtype_ = dtype_or_default(dtype); auto zero_ks = at::DispatchKeySet(c10::DispatchKey::Meta) | at::DispatchKeySet(c10::DispatchKey::ZeroTensor); - auto out = at::detail::empty_generic(size, &allocator, zero_ks, dtype_, c10::nullopt); + auto out = at::detail::empty_generic_symint(size, &allocator, zero_ks, dtype_, c10::nullopt); return out; } @@ -1391,11 +1391,29 @@ Tensor zeros_like( if (self.is_sparse()) { res.sparse_resize_and_clear_( self.sizes(), self.sparse_dim(), self.dense_dim()); + } else if (at::sparse_csr::is_sparse_compressed(self)) { + res.sparse_resize_and_clear_( + self.sizes(), self.sizes().size() - self.dense_dim(), self.dense_dim()); } else { res.sparse_resize_and_clear_(self.sizes(), self.sizes().size(), 0); } res._coalesced_(true); + return res; + } else if (at::sparse_csr::is_sparse_compressed(options.layout())) { + int64_t nnz = 0; + int64_t dense_dim = (self.layout() == kStrided ? 
self.dim() - 2: self.dense_dim()); + DimVector blocksize{}; + if (self.layout() == kSparseBsr || self.layout() == kSparseBsc) { + blocksize.append(at::sparse_csr::getBlockSize(self)); + } + ScalarType index_dtype = at::sparse_csr::getIndexDtype(self); + auto res = at::native::sparse_compressed_tensor_with_dims( + nnz, dense_dim, self.sizes(), blocksize, index_dtype, + typeMetaToScalarType(options.dtype()), options.layout(), options.device(), options.pinned_memory()); + Tensor compressed_indices, plain_indices; + std::tie(compressed_indices, plain_indices) = at::sparse_csr::getCompressedPlainIndices(res); + compressed_indices.zero_(); return res; } auto result = at::empty_like(self, options, optional_memory_format); diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h index 26b5739791114..f9b2893d768a9 100644 --- a/aten/src/ATen/native/TensorFactories.h +++ b/aten/src/ATen/native/TensorFactories.h @@ -124,12 +124,13 @@ struct ZeroTensorAllocator final : public at::Allocator { static void deleter(void* const pointer) { TORCH_INTERNAL_ASSERT(!pointer); } - DataPtr allocate(const size_t /*nbytes*/) const override { + DataPtr allocate(const size_t /*nbytes*/) override { return {nullptr, nullptr, &deleter, device_}; } DeleterFnPtr raw_deleter() const override { return deleter; } + void copy_data(void* dest, const void* src, std::size_t count) const final {} at::Device device_; }; diff --git a/aten/src/ATen/native/TensorIteratorDynamicCasting.h b/aten/src/ATen/native/TensorIteratorDynamicCasting.h index b042ebae27bfc..a2bdd6eb13e4b 100644 --- a/aten/src/ATen/native/TensorIteratorDynamicCasting.h +++ b/aten/src/ATen/native/TensorIteratorDynamicCasting.h @@ -3,12 +3,11 @@ #include #include #include -#include #include #include -// This file includes utilties for dynamic_casting done by TensorIterator, see CUDALoops.cuh and Loops.h. +// This file includes utilities for dynamic_casting done by TensorIterator, see CUDALoops.cuh and Loops.h. // dynamic_casting handles when the types expected by the iterator do not match the types of the arguments // to the function that is being called. 
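Illustration (not part of the patch): the sparse-compressed branch added to zeros_like in TensorFactories.cpp above is meant to produce an all-zero tensor that keeps the compressed layout (CSR/CSC/BSR/BSC) with no specified values. A minimal sketch, assuming only the public libtorch C++ API (torch::tensor, Tensor::to_sparse_csr, at::zeros_like, Tensor::_nnz):

#include <torch/torch.h>
#include <iostream>

int main() {
  auto dense = torch::tensor({{1.0, 0.0}, {0.0, 2.0}}, torch::kFloat);
  auto csr = dense.to_sparse_csr();   // sparse compressed (CSR) layout
  auto z = at::zeros_like(csr);
  // Expected with the branch above: layout stays SparseCsr and nnz == 0.
  std::cout << z.layout() << " nnz=" << z._nnz() << std::endl;
  return 0;
}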
diff --git a/aten/src/ATen/native/TensorIteratorReduce.cpp b/aten/src/ATen/native/TensorIteratorReduce.cpp index 7164b89554c59..9c4e4e9459d46 100644 --- a/aten/src/ATen/native/TensorIteratorReduce.cpp +++ b/aten/src/ATen/native/TensorIteratorReduce.cpp @@ -59,7 +59,7 @@ static void two_pass_reduction(TensorIteratorBase& iter, loop2d_t loop) { auto shape = first_reduce.shape(); auto strides = first_reduce.get_strides(); - // Bump output ptr so each thread has its own ouput slice + // Bump output ptr so each thread has its own output slice auto base_ptrs = first_reduce.get_base_ptrs(); base_ptrs[0] += buffer_stride * thread_num; diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 05bd5f4cafa2a..a99e6e3a50c11 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -36,9 +36,11 @@ #include #include #else +#include #include #include #include +#include #include #include #include @@ -151,6 +153,7 @@ #include #include #include +#include #include #include #include @@ -409,7 +412,7 @@ Tensor& set_storage_meta__symint(Tensor& result, Storage storage, c10::SymInt st result.unsafeGetTensorImpl()->set_sizes_and_strides(size, stride, storage_offset); // Matches maybe_resize_storage_cpu no-numel behavior - if (result.sym_numel() != 0) { + if (TORCH_GUARD_SIZE_OBLIVIOUS(result.sym_numel().sym_ne(0))) { // maybe_resize_storage_cpu can handle no storage exists at all but // that should never be the case here TORCH_INTERNAL_ASSERT(storage); @@ -418,9 +421,19 @@ Tensor& set_storage_meta__symint(Tensor& result, Storage storage, c10::SymInt st // it. TODO: Actually this might not quite be correct if we use special // pointers to track whether or not fake cuda tensors are pinned or not const auto itemsize = result.dtype().itemsize(); - c10::SymInt size_bytes = at::detail::computeStorageNbytes( + c10::SymInt new_size_bytes = at::detail::computeStorageNbytes( size, stride, itemsize, std::move(storage_offset)); - storage.set_nbytes(std::move(size_bytes)); + // TODO: When there are unbacked SymInts, we unconditionally skip the + // setter. This is technically wrong, but we cannot conveniently test + // the real condition in many cases, because a lot of people are using + // set_ just to swizzle metadata on a tensor, they didn't actually want + // to see if they need to resize the storage. + // + // The old behavior was to unconditionally set_nbytes, but I think not + // setting it is more safe. + if (new_size_bytes.has_hint() && storage.sym_nbytes().has_hint() && TORCH_GUARD_SIZE_OBLIVIOUS(new_size_bytes.sym_gt(storage.sym_nbytes()))) { + storage.set_nbytes(std::move(new_size_bytes)); + } } return result; } @@ -508,7 +521,7 @@ Tensor sparse_broadcast_to(const Tensor& self, IntArrayRef size) { } } // to_broadcast conserves is_coalesced property iff only the last - // sparse dimensions are expaned. Possible expansion of dense + // sparse dimensions are expanded. Possible expansion of dense // dimensions can be discarded as it does not affect the is_coalesce // property. 
bool is_coalesced = self.dim()==0 || (self.is_coalesced() && (max_unchanged_dim < min_broadcast_dim || min_broadcast_dim == -1)); @@ -553,7 +566,7 @@ static void fastCatOutDim0(const Tensor& out, const MaterializedITensorListRef& for (const Tensor& input : inputs) { TORCH_CHECK(outBytes >= totalBytes); if (input.nbytes() > 0) { - std::memcpy(dataPtr + totalBytes, input.data_ptr(), input.nbytes()); + std::memcpy(dataPtr + totalBytes, input.const_data_ptr(), input.nbytes()); } totalBytes += input.nbytes(); } @@ -608,7 +621,7 @@ TORCH_IMPL_FUNC(cat_out_cpu) .set_check_mem_overlap(false) .resize_outputs(false) .add_output(result_slice) - .add_input(source_slice) + .add_const_input(source_slice) .enforce_safe_casting_to_output(true) .build(); @@ -616,10 +629,10 @@ TORCH_IMPL_FUNC(cat_out_cpu) if (cat_should_skip_tensor(tensor)) { continue; } - auto source_data = static_cast(tensor.data_ptr()); + auto source_data = static_cast(tensor.const_data_ptr()); auto result_data = static_cast(result_slice_data) + offset * result_stride_bytes; iter.unsafe_replace_operand(0, result_data); - iter.unsafe_replace_operand(1, source_data); + iter.unsafe_replace_operand(1, const_cast(source_data)); copy_stub(iter.device_type(), iter, false); offset += slice_dim_size; } @@ -635,7 +648,7 @@ TORCH_IMPL_FUNC(cat_out_cpu) .set_check_mem_overlap(false) // Already checked above .resize_outputs(false) .add_output(result_slice) - .add_input(tensor) + .add_const_input(tensor) .promote_inputs_to_common_dtype(true) .cast_common_dtype_to_outputs(true) .enforce_safe_casting_to_output(true) @@ -1003,7 +1016,7 @@ std::vector tensor_split(const Tensor& self, const Tensor& tensor_indice int64_t sections = tensor_indices_or_sections.item(); return self.tensor_split(sections, dim); } else { - auto indices_data = tensor_indices_or_sections.data_ptr(); + auto indices_data = tensor_indices_or_sections.const_data_ptr(); auto stride = tensor_indices_or_sections.stride(0); auto numel = tensor_indices_or_sections.numel(); std::vector indices(numel); @@ -1343,22 +1356,22 @@ Tensor& narrow_copy_dense_cpu_out( return output; } - char* src_bytes = static_cast(self_contig->data_ptr()); + const char* src_bytes = static_cast(self_contig->const_data_ptr()); char* dst_bytes = static_cast(output.data_ptr()); size_t src_block_size_bytes = itemsize * src_block_size; size_t dst_block_size_bytes = itemsize * dst_block_size; size_t src_offset = unit * start; - char* src_offset_bytes = src_bytes + itemsize * src_offset; + const char* src_offset_bytes = src_bytes + itemsize * src_offset; char* dst_offset_bytes = dst_bytes; for (const auto i : c10::irange(num_blocks)) { - char* local_src_offset_bytes = src_offset_bytes + i * src_block_size_bytes; + const char* local_src_offset_bytes = src_offset_bytes + i * src_block_size_bytes; char* local_dst_offset_bytes = dst_offset_bytes + i * dst_block_size_bytes; TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - static_cast(local_src_offset_bytes + dst_block_size_bytes) <= - static_cast(src_bytes + src_nbytes)); + static_cast(local_src_offset_bytes + dst_block_size_bytes) <= + static_cast(src_bytes + src_nbytes)); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( static_cast(local_dst_offset_bytes + dst_block_size_bytes) <= static_cast(dst_bytes + dst_nbytes)); @@ -1443,16 +1456,12 @@ static _permute_size_stride_estimation(const Tensor& self, IntArrayRef dims) { } Tensor permute(const Tensor& self, IntArrayRef dims) { - DimVector new_sizes, new_strides; - std::vector _; - std::tie(new_sizes, new_strides, _) = _permute_size_stride_estimation(self, 
dims); + auto [new_sizes, new_strides, _] = _permute_size_stride_estimation(self, dims); return self.as_strided(new_sizes, new_strides); } Tensor permute_sparse_coo(const Tensor& self, IntArrayRef dims) { - DimVector new_sizes, _; - std::vector wrapped_dims; - std::tie(new_sizes, _, wrapped_dims) = _permute_size_stride_estimation(self, dims); + auto [new_sizes, _, wrapped_dims] = _permute_size_stride_estimation(self, dims); const auto ndim = self.dim(); const auto sparse_ndim = self.sparse_dim(); @@ -1826,7 +1835,7 @@ Tensor select_symint(const Tensor& self, int64_t dim, c10::SymInt index) { auto size = self.sym_sizes()[dim]; // Note: `size < -index` is not equivalent to `size <= -1 - index` if index is INT64_MIN // For std::numeric_limits::min() result of unary minus is undefined by the standard - // but in practice is equal to self. On the other hand, indexing wraping is valid for all + // but in practice is equal to self. On the other hand, indexing wrapping is valid for all // negative int64_t values, as x[INT64_MIN] is the same as x[INT64_MAX] if (size <= -1 - index || size <= index) { if (self.has_names() && self.names()[dim] != Dimname::wildcard()) { @@ -1877,7 +1886,7 @@ Tensor select_backward_symint(const Tensor& grad, c10::SymIntArrayRef input_size Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& index) { /* Algorithm: - index - a 1-D tensor of indicies with shape (n,) + index - a 1-D tensor of indices with shape (n,) self - sparse tensor, its shape is sizes = sparse_shape + dense_shape indices - 2-D tensor of indices, shape is (sparse_dims, nnz) values - (1+len(dense_shape))-D tensor of values, shape is (nnz,) + dense_shape @@ -2022,15 +2031,13 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in return std::make_tuple(dim_indices, at::arange(nnz, dim_indices.options()), nneg_index); } else { - Tensor sorted_dim_indices, sorted_dim_indices_idx; - std::tie(sorted_dim_indices, sorted_dim_indices_idx) = dim_indices.sort(); + auto [sorted_dim_indices, sorted_dim_indices_idx] = dim_indices.sort(); return std::make_tuple(sorted_dim_indices, sorted_dim_indices_idx, nneg_index); } } // sort nneg_index to binary search into it else { - Tensor sorted_nneg_index, sorted_nneg_index_idx; - std::tie(sorted_nneg_index, sorted_nneg_index_idx) = nneg_index.sort(); + auto [sorted_nneg_index, sorted_nneg_index_idx] = nneg_index.sort(); return std::make_tuple(sorted_nneg_index, sorted_nneg_index_idx, dim_indices); } }(); @@ -2061,7 +2068,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in // fill in src_int_idx, sorted_int_idx, int_counts { const auto sorted_len = sorted.numel(); - const auto* ptr_sorted = sorted.data_ptr(); + const auto* ptr_sorted = sorted.const_data_ptr(); const auto* ptr_sorted_start = ptr_sorted; const auto* ptr_sorted_end = ptr_sorted + sorted_len; @@ -2071,7 +2078,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).data_ptr(); auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).data_ptr(); auto* ptr_tid_int_counts = int_counts.select(0, tid).data_ptr(); - const auto* ptr_src = src.data_ptr() + start; + const auto* ptr_src = src.const_data_ptr() + start; for (const auto i : c10::irange(start, end)) { const auto src_val = *ptr_src++; @@ -2124,14 +2131,14 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in auto* ptr_selected_src = selected_src.data_ptr(); 
const auto thread_offsets = compressed_int_counts.cumsum(0).sub_(compressed_int_counts); - const auto* ptr_sorted_idx = sorted_idx.data_ptr(); + const auto* ptr_sorted_idx = sorted_idx.const_data_ptr(); at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) { const auto start = tid * chunk_size_src; const auto end = std::min(start + chunk_size_src, src_len); const auto tid_offset = thread_offsets.const_data_ptr()[tid]; - const auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).data_ptr(); - const auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).data_ptr(); - const auto* ptr_tid_int_counts = int_counts.select(0, tid).data_ptr(); + const auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).const_data_ptr(); + const auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).const_data_ptr(); + const auto* ptr_tid_int_counts = int_counts.select(0, tid).const_data_ptr(); auto* ptr_tid_selected_sorted = ptr_selected_sorted + tid_offset; auto* ptr_tid_selected_src = ptr_selected_src + tid_offset; @@ -2166,7 +2173,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in bool run_in_parallel = true) -> Tensor { auto cidx = at::empty({len + 1}, idx.options()); - const auto* ptr_idx = idx.data_ptr(); + const auto* ptr_idx = idx.const_data_ptr(); auto* ptr_cidx = cidx.data_ptr(); const auto idx_len = idx.numel(); @@ -2205,7 +2212,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in } else { auto* ptr_counts = counts.data_ptr(); - const auto* ptr_vals = t.data_ptr(); + const auto* ptr_vals = t.const_data_ptr(); for (C10_UNUSED const auto _ : c10::irange(t.numel())) { ++ptr_counts[*ptr_vals++]; } @@ -2313,10 +2320,10 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in const auto src_idx_len = src_intersection_offsets.const_data_ptr()[size - 1]; auto src_idx = at::empty({src_idx_len}, src.options()); - const auto* ptr_src = src.data_ptr(); - const auto* ptr_intersection_counts = intersection_counts.data_ptr(); - const auto* ptr_src_intersection_counts = src_intersection_counts.data_ptr(); - const auto* ptr_src_intersection_offsets = src_intersection_offsets.data_ptr(); + const auto* ptr_src = src.const_data_ptr(); + const auto* ptr_intersection_counts = intersection_counts.const_data_ptr(); + const auto* ptr_src_intersection_counts = src_intersection_counts.const_data_ptr(); + const auto* ptr_src_intersection_offsets = src_intersection_offsets.const_data_ptr(); auto* ptr_src_idx = src_idx.data_ptr(); const auto src_len = src.numel(); @@ -2329,9 +2336,9 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in const auto end = std::min(start + chunk_size, src_len); auto* ptr_src_tid = ptr_src + start; const auto* ptr_src_counts_per_thread - = src_counts_per_thread.select(0, tid).data_ptr(); + = src_counts_per_thread.select(0, tid).const_data_ptr(); const auto* ptr_src_offset_counts_per_thread - = src_offset_counts_per_thread.select(0, tid).data_ptr(); + = src_offset_counts_per_thread.select(0, tid).const_data_ptr(); auto tid_counts = at::zeros({size}, src.options()); auto* ptr_tid_counts = tid_counts.data_ptr(); @@ -2356,8 +2363,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in return std::make_tuple(src_idx, src_idx_offsets); }(); - Tensor idx_selected, src_selected; - std::tie(idx_selected, src_selected) = [&]( + auto [idx_selected, src_selected] = [&]( int64_t grain_size = at::internal::GRAIN_SIZE ) -> 
std::tuple { const auto thread_offset = [&]() { @@ -2366,16 +2372,16 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in auto counts_per_thread = idx_counts_per_thread.mul_(src_counts).sum(-1); return counts_per_thread.cumsum(0).sub_(counts_per_thread); }(); - const auto* ptr_thread_offset = thread_offset.data_ptr(); + const auto* ptr_thread_offset = thread_offset.const_data_ptr(); auto idx_selected = at::empty({res_len}, idx.options()); auto src_selected = at::empty({res_len}, src.options()); - const auto* ptr_idx = idx.data_ptr(); - const auto* ptr_src_counts = src_counts.data_ptr(); - const auto* ptr_intersection_counts = intersection_counts.data_ptr(); - const auto* ptr_src_idx = src_idx.data_ptr(); - const auto* ptr_src_idx_offsets = src_idx_offsets.data_ptr(); + const auto* ptr_idx = idx.const_data_ptr(); + const auto* ptr_src_counts = src_counts.const_data_ptr(); + const auto* ptr_intersection_counts = intersection_counts.const_data_ptr(); + const auto* ptr_src_idx = src_idx.const_data_ptr(); + const auto* ptr_src_idx_offsets = src_idx_offsets.const_data_ptr(); auto* ptr_idx_selected = idx_selected.data_ptr(); auto* ptr_src_selected = src_selected.data_ptr(); @@ -2428,8 +2434,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in const auto get_result_small_nnz_small_index = [&]() -> Tensor { const auto dim_indices_in_inner_loop = nnz >= index_len; - Tensor outer, inner; - std::tie(outer, inner) = [&]() -> std::tuple { + auto [outer, inner] = [&]() -> std::tuple { if (dim_indices_in_inner_loop) { return std::make_tuple(nneg_index, dim_indices); } @@ -2438,8 +2443,8 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in } }(); - const auto* ptr_outer = outer.data_ptr(); - const auto* ptr_inner = inner.data_ptr(); + const auto* ptr_outer = outer.const_data_ptr(); + const auto* ptr_inner = inner.const_data_ptr(); // NOTE: if very critical, replace std::vector with // a data structure that operates on stack up to some limit. auto outer_selected_idx = std::vector(); @@ -2559,6 +2564,17 @@ Tensor slice( return result; } +Tensor slice_inverse_symint( + const Tensor& self, + const Tensor& base, + int64_t /* dim */, + c10::optional /* start */, + c10::optional /* end */, + SymInt /* step */) { + // assume self has enough to storage to be viewed with base's metadata + return self.as_strided_symint(base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); +} + Tensor slice_backward(const Tensor& grad, IntArrayRef input_sizes, int64_t dim, int64_t start, int64_t end, int64_t step) { auto grad_input = at::zeros(input_sizes, grad.options()); grad_input.slice(dim, start, end, step).copy_(grad); @@ -2719,6 +2735,38 @@ static void check_stack_inputs(TensorList tensors, int64_t dim) { } } +// Pads each tensor on `dim`-th dimension such that padded_dim % num_chunks == 0. 
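+// For example, with dim = 0 and num_chunks = 3, a 1-D tensor holding 7 elements is zero-padded to 9 elements and viewed as a {3, 3} tensor; _chunk_cat below then concatenates these per-input views along dimension dim + 1.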
+static std::vector<Tensor> _pad_chunk(TensorList tensors, int64_t dim, int64_t num_chunks) { + auto num_tensors = tensors.size(); + std::vector<Tensor> padded_tensors; + padded_tensors.reserve(num_tensors); + for (const auto & tensor : tensors) { + auto tensor_size = tensor.sizes(); + std::vector<int64_t> padded_size(tensor_size.vec()); + padded_size[dim] = (tensor_size[dim] + num_chunks - 1) / num_chunks * num_chunks; + Tensor padded_tensor = tensor; + if (padded_size != tensor_size) { + padded_tensor = tensor.new_zeros(padded_size); + padded_tensor.narrow(dim, 0, tensor_size[dim]).copy_(tensor); + } + std::vector<int64_t> view_sizes(tensor_size.begin(), tensor_size.begin()+dim); + view_sizes.insert(view_sizes.end(), {num_chunks, -1}); + padded_tensors.push_back(padded_tensor.view(view_sizes)); + } + return padded_tensors; +} + +Tensor _chunk_cat(TensorList tensors, int64_t dim, int64_t num_chunks) { + auto wrapped_dim = at::native::preprocess_chunk_cat_inputs(tensors, dim, num_chunks); + return at::cat(_pad_chunk(tensors, wrapped_dim, num_chunks), wrapped_dim+1); +} + +Tensor& _chunk_cat_out(TensorList tensors, int64_t dim, int64_t num_chunks, Tensor& out) { + auto wrapped_dim = at::native::preprocess_chunk_cat_inputs(tensors, dim, num_chunks); + at::cat_out(out, _pad_chunk(tensors, wrapped_dim, num_chunks), wrapped_dim+1); + return out; +} + // TODO(msubkhankulov): refactor to use _stack Tensor stack(TensorList tensors, int64_t dim) { TORCH_CHECK(!tensors.empty(), @@ -2928,11 +2976,11 @@ Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) { } // Sparse COO is an exceptional sparse format as it allows transpose - // to be a view operation which is a convinient property for + // to be a view operation which is a convenient property for // in-place operations. For other sparse formats, the in-place // transpose would not be possible without shuffling the specified // values. So we don't support this as it would defeat the purpose - // of in-place opeations of being memory-efficient. + // of in-place operations of being memory-efficient. if (self.is_sparse()) { return sparse_transpose_(self, dim0, dim1); } @@ -3199,13 +3247,11 @@ inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { // dim is present if squeezing a single dimension and absent if squeezing all dimensions Tensor squeeze_qtensor(const Tensor& self, c10::OptionalIntArrayRef dims) { auto quantizer = get_qtensorimpl(self)->quantizer(); - SymDimVector sizes; - SymDimVector strides; const auto ndim = self.dim(); auto mask = dims.has_value() ?
dim_list_to_bitset(dims, self.dim()) : std::bitset((1ull << self.dim()) - 1); - std::tie(sizes, strides) = inferSqueezeGeometry(self, mask); + auto [sizes, strides] = inferSqueezeGeometry(self, mask); if (quantizer->qscheme() == QScheme::PER_CHANNEL_AFFINE) { const auto* per_channel_quantizer = static_cast(quantizer.get()); auto axis = per_channel_quantizer->axis(); @@ -3417,6 +3463,10 @@ Tensor flatten(const Tensor& self, int64_t start_dim, int64_t end_dim) { } Tensor flatten(const Tensor& self, int64_t start_dim, int64_t end_dim, Dimname out_dim) { + start_dim = maybe_wrap_dim(start_dim, self.dim()); + end_dim = maybe_wrap_dim(end_dim, self.dim()); + TORCH_CHECK(start_dim <= end_dim, "flatten() has invalid args: start_dim cannot come after end_dim"); + auto outnames = self.names().vec(); outnames.erase(outnames.begin() + start_dim, outnames.begin() + end_dim + 1); outnames.insert(outnames.begin() + start_dim, out_dim); @@ -4012,6 +4062,13 @@ void split_with_sizes_copy_out(const at::Tensor & self, at::IntArrayRef split_si TORCH_CHECK(out.size() == tmp.size(), "split_with_sizes_copy_out() expected an out= argument of size ", tmp.size(), ", got size ", out.size()); for (const auto i : c10::irange(out.size())) { + if (resize_output_check(out[i], tmp[i].sizes())) { + out[i].resize_(tmp[i].sizes()); + } + TORCH_CHECK(out[i].dtype() == tmp[i].dtype(), + "Expected out tensor to have dtype ", tmp[i].dtype(), ", but got ", out[i].dtype(), " instead"); + TORCH_CHECK(out[i].device() == tmp[i].device(), + "Expected out tensor to have device ", tmp[i].device(), ", but got ", out[i].device(), " instead"); out[i].copy_(tmp[i]); } } @@ -4025,11 +4082,13 @@ void unbind_copy_int_out(const at::Tensor & self, int64_t dim, at::TensorList o } } -int64_t sparse_dim_strided(const at::Tensor& self) { +int64_t sparse_dim_default(const Tensor& self) { + TORCH_CHECK(self.layout() == kStrided, "sparse_dim expected sparse or strided tensor layout but got ", self.layout()); return 0; } -int64_t dense_dim_strided(const at::Tensor& self) { +int64_t dense_dim_default(const Tensor& self) { + TORCH_CHECK(self.layout() == kStrided, "dense_dim expected sparse or strided tensor layout but got ", self.layout()); return self.dim(); } diff --git a/aten/src/ATen/native/TensorShape.h b/aten/src/ATen/native/TensorShape.h index 1c84abb822aba..c35023d076e73 100644 --- a/aten/src/ATen/native/TensorShape.h +++ b/aten/src/ATen/native/TensorShape.h @@ -8,7 +8,7 @@ namespace at::native { TORCH_API at::Tensor clone_preserve_strides(const at::Tensor& self); inline bool cat_should_skip_tensor(const Tensor& t) { - return t.numel() == 0 && t.dim() == 1; + return t.sym_numel() == 0 && t.dim() == 1; } // Check to see if the shape of tensors is compatible @@ -55,4 +55,51 @@ inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t di return num_splits; } +inline bool have_same_ndims(TensorList tensors) { + auto ndim = tensors[0].dim(); + for (const auto tensor_idx : c10::irange(tensors.size())) { + if(tensors[tensor_idx].dim() != ndim) { + return false; + } + } + return true; +} + +inline void leading_dimension_matches(TensorList tensors, int64_t dim) { + auto tensor_zero_size = tensors[0].sizes(); + std::vector leading_dim_sizes(tensor_zero_size.begin(), tensor_zero_size.begin() + dim); + for (const auto i : c10::irange(tensors.size())) { + at::Tensor tensor = tensors[i]; + for(const auto j : c10::irange(dim)) { + TORCH_CHECK( + tensor.size(j) == leading_dim_sizes[j], + "_chunk_cat expects same sizes of 0,...,dim-1 dimensions 
for all tensors" + ); + } + } +} + +inline int64_t preprocess_chunk_cat_inputs(TensorList tensors, int64_t dim, int64_t num_chunks) { + TORCH_CHECK(num_chunks >= 1, "_chunk_cat expects positive num_chunks"); + TORCH_CHECK(!tensors.empty(), + "_chunk_cat expects a non-empty input tensor list"); + auto expected_dtype = tensors[0].dtype(); + auto expected_device = tensors[0].device(); + for(const auto i : c10::irange(tensors.size())) { + TORCH_CHECK(tensors[i].numel() > 0, "_chunk_cat expects non-empty tensor"); + TORCH_CHECK(tensors[i].dtype() == expected_dtype, "_chunk_cat expects all input tensors with the same dtype"); + TORCH_CHECK(tensors[i].device() == expected_device, "_chunk_cat expects all input tensors on the same device"); + } + if (have_same_ndims(tensors)) { + dim = maybe_wrap_dim(dim, tensors[0].dim()); + } else { + TORCH_CHECK(dim >= 0, "_chunk_cat expects non-negative dim when input tensors have different ndims") + for(const auto i : c10::irange(tensors.size())) { + TORCH_CHECK(dim < tensors[i].ndimension(), "_chunk_cat expects dim < ndim for all input tensors"); + } + } + leading_dimension_matches(tensors, dim); + return dim; +} + } // namespace at::native diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index c2f54c5b66574..5a7c3a6de965f 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -63,8 +63,8 @@ Tensor flip(const Tensor& self, IntArrayRef dims) { .check_all_same_dtype(false) .declare_static_dtype_and_device(self.scalar_type(), self.device()) .add_output(out_tensor) - .add_input(self) - .add_input(restrided_self) + .add_const_input(self) + .add_const_input(restrided_self) .build(); auto* data = reinterpret_cast(iter.data_ptr(0)); diff --git a/aten/src/ATen/native/TestOps.cpp b/aten/src/ATen/native/TestOps.cpp index 6dd2d1aa55517..e2fce123035ba 100644 --- a/aten/src/ATen/native/TestOps.cpp +++ b/aten/src/ATen/native/TestOps.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -111,6 +113,21 @@ Tensor _test_check_tensor(const Tensor& self) { return self.clone(); } +Tensor _test_parallel_materialize(const Tensor& self, int64_t num_parallel, bool skip_first) { + at::parallel_for(0, num_parallel, 1, [&](int64_t begin, int64_t end){ + // NOTE: skip_first is meant to avoid triggering the materialization from + // the first thread, to ensure that the subthreads throw the error + // correctly. On some platforms, the first thread is the main thread and it + // begins executing the loop function much earlier than the subthreads.
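// (Presumably, with a grain size of 1, at::parallel_for hands each worker a
// [begin, end) chunk of [0, num_parallel); when the pool has at least
// num_parallel threads every chunk is a single index, so the main thread's
// chunk is exactly [0, 1), which is the chunk the begin == 0 && end == 1 test
// below skips.)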
+ if (skip_first && begin == 0 && end == 1) { + return; + } else { + self.mutable_data_ptr(); + } + }); + return self; +} + } // namespace at::native namespace at::functionalization { diff --git a/aten/src/ATen/native/TopKImpl.h b/aten/src/ATen/native/TopKImpl.h index a9790e892c642..0a11f5f408753 100644 --- a/aten/src/ATen/native/TopKImpl.h +++ b/aten/src/ATen/native/TopKImpl.h @@ -36,14 +36,14 @@ void topk_impl_loop( TensorAccessor mode_indices( reinterpret_cast(data[1] + i * strides[1]), &k, &mode_indices_stride); - TensorAccessor tmp_values( + TensorAccessor tmp_values( reinterpret_cast(data[2] + i * strides[2]), &dim_size, &tmp_values_stride); - auto n = dim_size; - auto use_partial_sort = k * 64 <= n; + auto n_2 = dim_size; + auto use_partial_sort = k * 64 <= n_2; - for (const auto j : c10::irange(n)) { + for (const auto j : c10::irange(n_2)) { queue[j].first = tmp_values[j]; queue[j].second = j; } diff --git a/aten/src/ATen/native/TriangularOps.cpp b/aten/src/ATen/native/TriangularOps.cpp index 66147b441fd74..9cb75a0eccf4e 100644 --- a/aten/src/ATen/native/TriangularOps.cpp +++ b/aten/src/ATen/native/TriangularOps.cpp @@ -41,7 +41,7 @@ namespace { template void apply_triu_tril_single( scalar_t* result, - scalar_t* self, + const scalar_t* self, bool inplace, int64_t k, int64_t n, @@ -86,7 +86,7 @@ template void apply_triu_tril(const Tensor& result, const Tensor& self, bool inplace, int64_t k, bool upper) { auto n = self.size(-2); auto m = self.size(-1); - auto self_data = self.data_ptr(); + auto self_data = self.const_data_ptr(); auto self_stride = (self.dim() > 2 && self.stride(-3) > 0) ? self.stride(-3) : 1; auto batchsize = batchCountTrilTriu(result); auto self_row_stride = self.stride(-2); @@ -107,7 +107,7 @@ void apply_triu_tril(const Tensor& result, const Tensor& self, bool inplace, int parallel_for(0, batchsize, 0, [&](int64_t start, int64_t end) { for (const auto b : c10::irange(start, end)) { - scalar_t* self_batch = &self_data[b * self_stride]; + const scalar_t* self_batch = &self_data[b * self_stride]; scalar_t* result_batch = &result_data[b * result_stride]; apply_triu_tril_single( result_batch, diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index c978ffcc2d89a..6c22d2583f130 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -412,7 +412,6 @@ template static inline Tensor& unary_op_impl_float_out(Tensor& result, const Tensor& self, Stub& stub, Args... 
args) { auto iter = TensorIterator::unary_float_op(result, self); stub(iter.device_type(), iter, args...); - iter.cast_outputs(); return result; } @@ -868,7 +867,7 @@ Tensor& logical_not_out(const Tensor& self, Tensor& result) { TensorIterator iter = TensorIteratorConfig() .check_all_same_dtype(false) .add_output(result) - .add_input(self) + .add_const_input(self) .build(); logical_not_stub(iter.device_type(), iter); return result; @@ -964,7 +963,7 @@ std::tuple frexp_out(const Tensor& self, auto iter = TensorIteratorConfig() .add_output(mantissa) .add_output(exponent) - .add_input(self) + .add_const_input(self) .check_all_same_dtype(false) .set_check_mem_overlap(true) .build(); @@ -973,7 +972,7 @@ std::tuple frexp_out(const Tensor& self, return std::tuple(mantissa, exponent); } -// alias for lgamma, implements special.gammanln equivalent to +// alias for lgamma, implements special.gammaln equivalent to // scipy.special.gammaln Tensor special_gammaln(const Tensor& self) { return self.lgamma(); } Tensor& special_gammaln_out(const Tensor& self, Tensor& result) { return at::lgamma_out(result, self); } diff --git a/aten/src/ATen/native/Unfold2d.h b/aten/src/ATen/native/Unfold2d.h index 98d628f7bf2ca..e5fe7d4468217 100644 --- a/aten/src/ATen/native/Unfold2d.h +++ b/aten/src/ATen/native/Unfold2d.h @@ -6,7 +6,25 @@ namespace at::native { -using unfold2d_fn = void (*)( +using unfold2d_copy_fn = void (*)( + ScalarType dtype, + void *finput, + const void *input, + int64_t kH, + int64_t kW, + int64_t dH, + int64_t dW, + int64_t padH, + int64_t padW, + int64_t n_input_plane, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + bool is_channels_last +); + +using unfold2d_acc_fn = void (*)( ScalarType dtype, void *finput, void *input, @@ -24,7 +42,7 @@ using unfold2d_fn = void (*)( bool is_channels_last ); -DECLARE_DISPATCH(unfold2d_fn, unfolded2d_copy_stub); -DECLARE_DISPATCH(unfold2d_fn, unfolded2d_acc_stub); +DECLARE_DISPATCH(unfold2d_copy_fn, unfolded2d_copy_stub); +DECLARE_DISPATCH(unfold2d_acc_fn, unfolded2d_acc_stub); } // namespace at::native diff --git a/aten/src/ATen/native/UnfoldBackward.h b/aten/src/ATen/native/UnfoldBackward.h index 7ff39f84c6fdd..44e05c125913e 100644 --- a/aten/src/ATen/native/UnfoldBackward.h +++ b/aten/src/ATen/native/UnfoldBackward.h @@ -100,8 +100,8 @@ static C10_UNUSED TensorIterator _make_unfold_backward_iter_over_grad_out( .check_all_same_dtype(false) .resize_outputs(false) .add_owned_output(grad_out_restrided) - .add_owned_input(grad_in_restrided) - .add_owned_input(idx_dim_restrided) + .add_owned_const_input(grad_in_restrided) + .add_owned_const_input(idx_dim_restrided) .build(); return iter; diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index be220fc40ec7e..801af5d5e79fe 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -2,7 +2,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include #include @@ -37,7 +37,7 @@ std::tuple unique_cpu_bool_template( const bool return_inverse, const bool return_counts) { const Tensor& input = self.contiguous(); - bool* input_data = input.data_ptr(); + const bool* input_data = input.const_data_ptr(); int64_t numel = input.numel(); Tensor output = at::empty({0}, self.options()); @@ -270,7 +270,7 @@ std::tuple unique_consecutive_cpu_template( const bool return_inverse, const bool return_counts) { const Tensor& input = self.contiguous(); - const scalar_t* input_data = input.data_ptr(); + 
const scalar_t* input_data = input.const_data_ptr(); int64_t numel = input.numel(); Tensor output = at::empty({numel}, input.options()); Tensor inverse_indices = at::empty({0}, self.options().dtype(kLong)); @@ -390,7 +390,7 @@ std::tuple _unique_dim_cpu_template( std::vector indices(input_flat.size(0)); std::iota(indices.begin(), indices.end(), 0); int64_t numel = input_flat.size(1); - scalar_t* input_flat_ptr = ((scalar_t*)input_flat.data_ptr()); + const scalar_t* input_flat_ptr = ((const scalar_t*)input_flat.const_data_ptr()); // sort indices using data if (!consecutive) { @@ -442,19 +442,17 @@ std::tuple _unique_dim_cpu_template( std::tuple _unique_cpu(const Tensor& self, const bool sorted, const bool return_inverse) { if (self.scalar_type() == kBool) { - Tensor output, inverse; - std::tie(output, inverse, std::ignore) = unique_cpu_bool_template( + auto [output, inverse, _] = unique_cpu_bool_template( self, return_inverse, /* return_counts */false); return std::make_tuple(output, inverse); } - return AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, self.scalar_type(), "unique", [&] { - Tensor output, inverse; + return AT_DISPATCH_V2(self.scalar_type(), "unique", [&] AT_WRAP({ // The current CPU implementation of unique always sort due to // this is faster than hash table - std::tie(output, inverse, std::ignore) = unique_cpu_sorted_template( + auto [output, inverse, _] = unique_cpu_sorted_template( self, return_inverse, /* return_counts */false, IsUnique()); return std::make_tuple(output, inverse); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple @@ -462,35 +460,35 @@ _unique2_cpu(const Tensor& self, const bool sorted, const bool return_inverse, c if (self.scalar_type() == kBool) { return unique_cpu_bool_template(self, return_inverse, return_counts); } - return AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, self.scalar_type(), "unique", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { // The current CPU implementation of unique always sort due to // this is faster than hash table return unique_cpu_sorted_template( self, return_inverse, return_counts, IsUnique()); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple unique_dim_cpu(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse, const bool return_counts) { - return AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kBool, kHalf, self.scalar_type(), "unique_dim", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique_dim", AT_WRAP([&] { // The current implementation using `dim` always sorts due to unhashable tensors return _unique_dim_cpu_template(self, dim, false, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple unique_dim_consecutive_cpu(const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts) { - return AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kBool, kHalf, self.scalar_type(), "unique_dim", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique_dim", AT_WRAP([&] { return _unique_dim_cpu_template(self, dim, true, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple unique_consecutive_cpu(const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional dim) { if (!dim.has_value() || (dim.value() == 0 && self.dim() == 1)) { - return 
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kBool, kHalf, self.scalar_type(), "unique", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { return unique_consecutive_cpu_template(self, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } return unique_dim_consecutive_cpu(self, dim.value(), return_inverse, return_counts); } diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index 95797cb538284..8dadc7cee3ae4 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -4,9 +4,12 @@ #include #include +#include #include +#include #include #include +#include /** * Note [compute_scales_value] @@ -467,30 +470,32 @@ static inline void compute_source_index_and_lambda( } } -// It will not be used by data types other than BFloat16. -template +// It will not be used by data types other than BFloat16 and Half. +template || !std::is_same::value, int> = 0> void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) { - TORCH_CHECK((std::is_same::value), - "Upsample backward only support BFloat16 in the lower percision data types on CPU.") + TORCH_CHECK((is_reduced_floating_point_v), + "Upsample backward only support BFloat16 and Half in the lower precision data types on CPU.") TORCH_CHECK((std::is_same::value), - "Upsample backward should use float as acc buffer for BFloat16 grad input on CPU.") + "Upsample backward should use float as acc buffer for BFloat16 and Half grad input on CPU.") return; } -template <> -void inline apply_grad_input(float* buffer_ptr, BFloat16* gin, int64_t size) { - using bVec = vec::Vectorized; - using fVec = vec::Vectorized; +template && std::is_same::value, int> = 0> +void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) { + using bVec = Vectorized; + using fVec = Vectorized; int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec gin_bvec = bVec::loadu(gin + d); fVec gin_fvec0, gin_fvec1; - std::tie(gin_fvec0, gin_fvec1) = convert_bfloat16_float(gin_bvec); + std::tie(gin_fvec0, gin_fvec1) = convert_to_float(gin_bvec); gin_fvec0 += fVec::loadu(buffer_ptr + d); gin_fvec1 += fVec::loadu(buffer_ptr + d + fVec::size()); fVec(0).store(buffer_ptr + d); fVec(0).store(buffer_ptr + d + fVec::size()); - convert_float_bfloat16(gin_fvec0, gin_fvec1).store(gin + d); + convert_from_float(gin_fvec0, gin_fvec1).store(gin + d); } for (; d < size; d++) { gin[d] += buffer_ptr[d]; diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index d08c8d3a48a67..f5e523c4a9114 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -106,7 +106,7 @@ namespace { template static void upsample_bicubic2d_backward_out_frame( - scalar_t* odata, + const scalar_t* odata, scalar_t* idata, int64_t input_height, int64_t input_width, @@ -136,7 +136,7 @@ static void upsample_bicubic2d_backward_out_frame( } for (const auto i : c10::irange(start, end)) { scalar_t* in = idata + i * input_slice_size; - scalar_t* out = odata + i * output_slice_size; + const scalar_t* out = odata + i * output_slice_size; for (const auto output_y : c10::irange(output_height)) { for (const auto output_x : c10::irange(output_width)) { @@ -205,7 +205,7 @@ static void upsample_bicubic2d_backward_kernel( AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, grad_output.scalar_type(), 
"upsample_bicubic2d_backward", [&] { scalar_t* idata = grad_input.mutable_data_ptr(); - scalar_t* odata = grad_output.data_ptr(); + const scalar_t* odata = grad_output.const_data_ptr(); upsample_bicubic2d_backward_out_frame( odata, diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp index afdece7f0d491..aa2bab7c6b945 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp @@ -11,18 +11,18 @@ namespace ao { namespace sparse { namespace { -const int64_t serialization_version_index = 0; -const int64_t bias_index = 1; -const int64_t out_features_block_size_index = 2; -const int64_t in_features_block_size_index = 3; -const int64_t weight_scales_index = 4; -const int64_t weight_zero_point_index = 5; -const int64_t quantization_scheme_index = 6; -const int64_t row_block_indices_index = 7; -const int64_t col_block_indices_index = 8; -const int64_t weight_values_index = 9; -const int64_t num_output_channels_index = 10; -const int64_t num_input_channels_index = 11; +constexpr int64_t serialization_version_index = 0; +constexpr int64_t bias_index = 1; +constexpr int64_t out_features_block_size_index = 2; +constexpr int64_t in_features_block_size_index = 3; +constexpr int64_t weight_scales_index = 4; +constexpr int64_t weight_zero_point_index = 5; +constexpr int64_t quantization_scheme_index = 6; +constexpr int64_t row_block_indices_index = 7; +constexpr int64_t col_block_indices_index = 8; +constexpr int64_t weight_values_index = 9; +constexpr int64_t num_output_channels_index = 10; +constexpr int64_t num_input_channels_index = 11; template std::vector unwrap_vector(at::Tensor tensor) { diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp index bedf2f4461f3a..8f80d920e3652 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp @@ -186,7 +186,7 @@ PackedLinearWeightQnnp::PackedLinearWeightQnnp( std::tie(w_zero_points_, w_scales_) = make_zero_points_and_scales_tensor(weight_contig); - const float* weight_scales_data = w_scales_.data_ptr(); + const float* weight_scales_data = w_scales_.const_data_ptr(); at::Tensor qnnp_weight = at::_empty_affine_quantized( weight_contig.sizes(), at::device(c10::kCPU).dtype(c10::kQUInt8), diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_serialize.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_serialize.cpp index e557ec3994134..d5790b5bc223e 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_serialize.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_serialize.cpp @@ -160,7 +160,7 @@ BCSRSerializationType PackedLinearWeight::serialize() { BCSRSerializationType PackedLinearWeightQnnp::serialize() { at::Tensor w_scales_compact; at::Tensor w_zero_points_compact; - const float* w_scales_data_ptr = w_scales_.data_ptr(); + const float* w_scales_data_ptr = w_scales_.const_data_ptr(); std::function subtract_128 = [](uint8_t v) { return static_cast(static_cast(v) - 128); }; diff --git a/aten/src/ATen/native/batch_norm.h b/aten/src/ATen/native/batch_norm.h index cbddde86ad8ba..eba4b0a963241 100644 --- a/aten/src/ATen/native/batch_norm.h +++ b/aten/src/ATen/native/batch_norm.h @@ -26,8 +26,13 @@ static TensorAccessor conditional_accessor_1d(const 
Tensor& t) { template static scalar_t* conditional_data_ptr(const Tensor& t) { - return t.defined() ? t.contiguous().data_ptr() - : nullptr; + if constexpr (std::is_const_v) { + return t.defined() ? t.contiguous().const_data_ptr() + : nullptr; + } else { + return t.defined() ? t.contiguous().data_ptr() + : nullptr; + } } } // namespace at::native diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp index a7349d8299ca6..88b43015d9906 100644 --- a/aten/src/ATen/native/cpu/Activation.cpp +++ b/aten/src/ATen/native/cpu/Activation.cpp @@ -30,14 +30,13 @@ static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const using Vec = Vectorized; scalar_t* output_data = output.data_ptr(); scalar_t* buffer_data = buffer.data_ptr(); - scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); parallel_for(0, input.numel(), 1, [&] (int64_t begin, int64_t end) { int64_t size = end - begin; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { Vec data_vec = Vec::loadu(input_data + begin+ d); - Vectorized data_vec0, data_vec1; - std::tie(data_vec0, data_vec1) = convert_to_float(data_vec); + auto [data_vec0, data_vec1] = convert_to_float(data_vec); Vectorized min_vec = minimum(data_vec0, Vectorized(float(0))); Vectorized buffer_vec0 = data_vec0.abs().neg().exp(); Vectorized output_vec0 = min_vec - buffer_vec0.log1p(); @@ -49,8 +48,7 @@ static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const } if (size - d > 0) { Vec data_vec = Vec::loadu(input_data + begin + d, size - d); - Vectorized data_vec0, data_vec1; - std::tie(data_vec0, data_vec1) = convert_to_float(data_vec); + auto [data_vec0, data_vec1] = convert_to_float(data_vec); Vectorized min_vec = minimum(data_vec0, Vectorized(float(0))); Vectorized buffer_vec0 = data_vec0.abs().neg().exp(); Vectorized output_vec0 = min_vec - buffer_vec0.log1p(); @@ -67,7 +65,7 @@ static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const using Vec = Vectorized; scalar_t* output_data = output.data_ptr(); scalar_t* buffer_data = buffer.data_ptr(); - scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); parallel_for(0, input.numel(), 1, [&] (int64_t begin, int64_t end) { int64_t size = end - begin; int64_t d = 0; @@ -108,10 +106,9 @@ static void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { return (max_deriv - sign * (float(b) / (float(1) + b))) * float(c); }, [=](Vec a, Vec b, Vec c) -> Vec { - Vectorized a0, a1, b0, b1, c0, c1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); - std::tie(c0, c1) = convert_to_float(c); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); + auto [c0, c1] = convert_to_float(c); auto mask = a0 < zero_vec; auto max_deriv_vec = Vectorized::blendv(zero_vec, one_vec, mask); auto sign_vec = Vectorized::blendv(one_vec.neg(), one_vec, mask); @@ -164,9 +161,8 @@ static void threshold_kernel( return float(x) <= threshold ? 
value : other; }, [&](Vectorized x, Vectorized other) -> Vectorized { - Vec x0, x1, other0, other1; - std::tie(x0, x1) = convert_to_float(x); - std::tie(other0, other1) = convert_to_float(other); + auto [x0, x1] = convert_to_float(x); + auto [other0, other1] = convert_to_float(other); return convert_from_float(Vec::blendv(other0, value_v, x0 <= threshold_v), Vec::blendv(other1, value_v, x1 <= threshold_v)); }); @@ -207,16 +203,15 @@ void elu_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scalar& scale return float(a) <= float(0) ? (std::exp(float(a) * negiptcoef) - float(1)) * negcoef : float(a) * poscoef; }, [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &one_vec, &zero_vec](Vectorized a) -> Vectorized { - Vectorized a0, a1, res0, res1; - std::tie(a0, a1) = convert_to_float(a); + auto [a0, a1] = convert_to_float(a); auto cmp0 = (a0 > zero_vec); auto cmp1 = (a1 > zero_vec); auto get_res_masked = [&](Vectorized& cmp, Vectorized& a) { return !cmp.zero_mask() ? a * poscoef_vec : Vectorized::blendv(((a * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a * poscoef_vec, cmp); }; - res0 = get_res_masked(cmp0, a0); - res1 = get_res_masked(cmp1, a1); + auto res0 = get_res_masked(cmp0, a0); + auto res1 = get_res_masked(cmp1, a1); return convert_from_float(res0, res1); }); }); @@ -268,10 +263,8 @@ void elu_backward_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scal } }, [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &zero_vec, is_result](Vectorized a, Vectorized b) -> Vectorized { - Vectorized a0, a1, res0, res1; - std::tie(a0, a1) = convert_to_float(a); - Vectorized b0, b1; - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); auto cmp0 = (b0 > zero_vec); auto cmp1 = (b1 > zero_vec); auto get_res_masked = [&](Vectorized& cmp, Vectorized& a, Vectorized& b) { @@ -282,8 +275,8 @@ void elu_backward_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scal return Vectorized::blendv(a * negiptcoef_vec * negcoef_vec * (b * negiptcoef_vec).exp(), a * poscoef_vec, cmp); } }; - res0 = get_res_masked(cmp0, a0, b0); - res1 = get_res_masked(cmp1, a1, b1); + auto res0 = get_res_masked(cmp0, a0, b0); + auto res1 = get_res_masked(cmp1, a1, b1); return convert_from_float(res0, res1); }); }); @@ -364,8 +357,7 @@ void GeluKernelImpl(TensorIteratorBase& it, GeluType approximate) { return float(0.5) * float(x) * (float(1) + std::tanh(inner)); }, [&](Vectorized x) -> Vectorized { - Vectorized x0, x1; - std::tie(x0, x1) = convert_to_float(x); + auto [x0, x1] = convert_to_float(x); auto x0_cube = x0 * x0 * x0; auto x1_cube = x1 * x1 * x1; auto inner_vec0 = kBetaVec * (x0 + kKappaVec * x0_cube); @@ -414,8 +406,7 @@ void GeluKernelImpl(TensorIteratorBase& it, GeluType approximate) { return float(x) * float(0.5) * (float(1) + std::erf(float(x) * kAlpha)); }, [&](Vectorized x) -> Vectorized { - Vectorized x0, x1; - std::tie(x0, x1) = convert_to_float(x); + auto [x0, x1] = convert_to_float(x); auto res0 = x0 * kPointFiveVec * (kOneVec + (x0 * kAlphaVec).erf()); auto res1 = x1 * kPointFiveVec * (kOneVec + (x1 * kAlphaVec).erf()); return convert_from_float(res0, res1); @@ -477,10 +468,8 @@ void GeluBackwardKernelImpl(TensorIteratorBase& it, GeluType approximate) { return float(dy) * (left_derivative + right_derivative); }, [&](Vectorized dy_vec, Vectorized x_vec) -> Vectorized { - Vectorized x0_vec, x1_vec; - std::tie(x0_vec, x1_vec) = convert_to_float(x_vec); - Vectorized dy0_vec, dy1_vec; - std::tie(dy0_vec, dy1_vec) = 
convert_to_float(dy_vec); + auto [x0_vec, x1_vec] = convert_to_float(x_vec); + auto [dy0_vec, dy1_vec] = convert_to_float(dy_vec); auto x0_sq = x0_vec * x0_vec; auto x1_sq = x1_vec * x1_vec; auto x0_cube = x0_vec * x0_vec * x0_vec; @@ -583,10 +572,8 @@ void GeluBackwardKernelImpl(TensorIteratorBase& it, GeluType approximate) { return float(dy) * (cdf + float(x) * pdf); }, [&](Vectorized dy, Vectorized x) -> Vectorized { - Vectorized x0, x1; - std::tie(x0, x1) = convert_to_float(x); - Vectorized dy0, dy1; - std::tie(dy0, dy1) = convert_to_float(dy); + auto [x0, x1] = convert_to_float(x); + auto [dy0, dy1] = convert_to_float(dy); auto cdf_vec0 = kPointFiveVec * (kOneVec + (x0 * kAlphaVec).erf()); auto cdf_vec1 = kPointFiveVec * (kOneVec + (x1 * kAlphaVec).erf()); auto pdf_vec0 = kBetaVec * (x0 * x0 * kMinusPointFiveVec).exp(); @@ -643,8 +630,7 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) { return std::min(std::max(float(self_val) + three, zero), six) / six; }, [&](vec::Vectorized self_val) -> vec::Vectorized { - Vectorized self_val0, self_val1; - std::tie(self_val0, self_val1) = convert_to_float(self_val); + auto [self_val0, self_val1] = convert_to_float(self_val); self_val0 = minimum( maximum(self_val0 + kThreeVec, kZeroVec), kSixVec @@ -698,9 +684,8 @@ void hardsigmoid_backward_kernel(TensorIteratorBase& iter) { : zero; }, [=](Vectorized grad_val, Vectorized self_val) -> Vectorized { - Vec self_val0, self_val1, grad_val0, grad_val1; - std::tie(self_val0, self_val1) = convert_to_float(self_val); - std::tie(grad_val0, grad_val1) = convert_to_float(grad_val); + auto [self_val0, self_val1] = convert_to_float(self_val); + auto [grad_val0, grad_val1] = convert_to_float(grad_val); Vec gradNonZeroMask = (self_val0 > neg_three) & (self_val0 < three); self_val0 = Vec::blendv(kZeroVec, grad_val0 * kOneSixthVec, gradNonZeroMask); gradNonZeroMask = (self_val1 > neg_three) & (self_val1 < three); @@ -759,11 +744,9 @@ void softshrink_kernel(TensorIteratorBase& iter, const Scalar& lambd) { return float(a) > lambd_val ? a - lambd_val : (float(a) < -lambd_val ? a + lambd_val : float(0)); }, [=](Vectorized self_val) -> Vectorized { - Vectorized self_val0, self_val1; - Vectorized self_val_t0, self_val_t1; - std::tie(self_val0, self_val1) = convert_to_float(self_val); - self_val_t0 = convert_from_float((self_val0 > lambdVec) & (self_val0 - lambdVec), (self_val1 > lambdVec) & (self_val1 - lambdVec)); - self_val_t1 = convert_from_float((self_val0 < -lambd_val) & (self_val0 + lambdVec), (self_val1 < -lambd_val) & (self_val1 + lambdVec)); + auto [self_val0, self_val1] = convert_to_float(self_val); + auto self_val_t0 = convert_from_float((self_val0 > lambdVec) & (self_val0 - lambdVec), (self_val1 > lambdVec) & (self_val1 - lambdVec)); + auto self_val_t1 = convert_from_float((self_val0 < -lambd_val) & (self_val0 + lambdVec), (self_val1 < -lambd_val) & (self_val1 + lambdVec)); return (self_val_t0 | self_val_t1); }); }); @@ -812,9 +795,8 @@ void hardtanh_backward_kernel(TensorIterator& iter, const Scalar& min, const Sca return (float(self_val) <= min_val || float(self_val) >= max_val) ? 
scalar_t(0) : grad_val; }, [=](Vectorized grad_val, Vectorized self_val) -> Vectorized { - Vectorized grad_val0, grad_val1, self_val0, self_val1; - std::tie(grad_val0, grad_val1) = convert_to_float(grad_val); - std::tie(self_val0, self_val1) = convert_to_float(self_val); + auto [grad_val0, grad_val1] = convert_to_float(grad_val); + auto [self_val0, self_val1] = convert_to_float(self_val); return convert_from_float( ((self_val0 > min_val) & (self_val0 < max_val)) & grad_val0, ((self_val1 > min_val) & (self_val1 < max_val)) & grad_val1 @@ -853,8 +835,7 @@ void hardswish_kernel(TensorIterator& iter) { return float(x) * std::min(std::max(float(x) + three, zero), six) / six; }, [&](vec::Vectorized x_vec) { - Vectorized x_vec0, x_vec1; - std::tie(x_vec0, x_vec1) = convert_to_float(x_vec); + auto [x_vec0, x_vec1] = convert_to_float(x_vec); x_vec0 = x_vec0 * minimum( maximum(x_vec0 + kThreeVec, kZeroVec), kSixVec @@ -915,9 +896,8 @@ void hardswish_backward_kernel(TensorIterator& iter) { } }, [&](vec::Vectorized grad_val, vec::Vectorized self_val) { - Vectorized self_val0, self_val1, grad_val0, grad_val1; - std::tie(self_val0, self_val1) = convert_to_float(self_val); - std::tie(grad_val0, grad_val1) = convert_to_float(grad_val); + auto [self_val0, self_val1] = convert_to_float(self_val); + auto [grad_val0, grad_val1] = convert_to_float(grad_val); self_val0 = Vec::blendv( Vec::blendv( grad_val0 * ((self_val0 / kThreeVec) + kOneHalfVec), @@ -990,8 +970,7 @@ static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { return float(a) > float(0) ? float(a) : float(a) * negval; }, [&](Vectorized a) -> Vectorized { - Vectorized a0, a1; - std::tie(a0, a1) = convert_to_float(a); + auto [a0, a1] = convert_to_float(a); auto res0 = a0 * (Vectorized::blendv(negval_v, one_vec, a0 > zero_vec)); auto res1 = a1 * (Vectorized::blendv(negval_v, one_vec, a1 > zero_vec)); return convert_from_float(res0, res1); @@ -1030,9 +1009,8 @@ static void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& n return float(a) > float(0) ? float(b) : float(b) * negval; }, [&](Vectorized a, Vectorized b) -> Vectorized { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); auto res0 = b0 * (Vectorized::blendv(negval_v, one_vec, a0 > zero_vec)); auto res1 = b1 * (Vectorized::blendv(negval_v, one_vec, a1 > zero_vec)); return convert_from_float(res0, res1); @@ -1073,8 +1051,7 @@ void softplus_kernel(TensorIteratorBase& iter, const Scalar& beta_, const Scalar : static_cast((std::log1p(std::exp(float(a) * beta))) / beta); }, [beta_vec, threshold_vec](Vectorized a) -> Vectorized { - Vectorized a0, a1; - std::tie(a0, a1) = convert_to_float(a); + auto [a0, a1] = convert_to_float(a); a0 = Vec::blendv((a0 * beta_vec).exp().log1p() / beta_vec, a0, (a0 * beta_vec) > threshold_vec); a1 = Vec::blendv((a1 * beta_vec).exp().log1p() / beta_vec, a1, (a1 * beta_vec) > threshold_vec); return convert_from_float(a0, a1); @@ -1118,9 +1095,8 @@ void softplus_backward_kernel(TensorIteratorBase& iter, const Scalar& beta_, con return (float(b) * beta) > threshold ? 
a : static_cast(float(a) * z / (z + float(1.))); }, [beta_vec, one_vec, threshold_vec](Vectorized a, Vectorized b) -> Vectorized { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); Vec z = (b0 * beta_vec).exp(); a0 = Vec::blendv(a0 * z / (z + one_vec), a0, (b0 * beta_vec) > threshold_vec); z = (b1 * beta_vec).exp(); @@ -1162,9 +1138,8 @@ void glu_kernel(TensorIteratorBase& iter) { return float(a) * (float_one_val / (float_one_val + std::exp(- float(b)))); }, [float_one_vec](Vectorized a, Vectorized b) -> Vectorized { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); return convert_from_float(a0 * (float_one_vec / (float_one_vec + b0.neg().exp())), a1 * (float_one_vec / (float_one_vec + b1.neg().exp()))); }); @@ -1217,10 +1192,9 @@ void glu_backward_kernel(TensorIterator& iter) { return (float_one_val - float(a)) * float(a) * float(b) * float(c); }, [float_one_vec](Vectorized a, Vectorized b, Vectorized c) -> Vectorized { - Vectorized a0, a1, b0, b1, c0, c1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); - std::tie(c0, c1) = convert_to_float(c); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); + auto [c0, c1] = convert_to_float(c); a0 = (float_one_vec - a0) * a0 * b0 * c0; a1 = (float_one_vec - a1) * a1 * b1 * c1; return convert_from_float(a0, a1); @@ -1254,8 +1228,7 @@ void silu_kernel(TensorIteratorBase& iter) { return float(x) / (1.0f + std::exp(-float(x))); }, [kOneVec](Vectorized x_vec) -> Vectorized { - Vectorized x_vec0, x_vec1; - std::tie(x_vec0, x_vec1) = convert_to_float(x_vec); + auto [x_vec0, x_vec1] = convert_to_float(x_vec); return convert_from_float( x_vec0 / (kOneVec + x_vec0.neg().exp()), x_vec1 / (kOneVec + x_vec1.neg().exp())); @@ -1289,9 +1262,8 @@ void silu_backward_kernel(TensorIteratorBase& iter) { return dy * sigmoid * (1.0f + x * (1.0f - sigmoid)); }, [kOneVec](Vectorized dy_vec, Vectorized x_vec) -> Vectorized { - Vectorized x_vec0, x_vec1, dy_vec0, dy_vec1; - std::tie(x_vec0, x_vec1) = convert_to_float(x_vec); - std::tie(dy_vec0, dy_vec1) = convert_to_float(dy_vec); + auto [x_vec0, x_vec1] = convert_to_float(x_vec); + auto [dy_vec0, dy_vec1] = convert_to_float(dy_vec); const Vectorized sigmoid0 = kOneVec / (kOneVec + x_vec0.neg().exp()); const Vectorized sigmoid1 = @@ -1330,8 +1302,7 @@ void mish_kernel(TensorIteratorBase& iter) { return static_cast(float(x) * std::tanh(std::log1p(std::exp(float(x))))); }, [](Vectorized x_vec) -> Vectorized { - Vectorized x_vec0, x_vec1; - std::tie(x_vec0, x_vec1) = convert_to_float(x_vec); + auto [x_vec0, x_vec1] = convert_to_float(x_vec); return convert_from_float( x_vec0 * x_vec0.exp().log1p().tanh(), x_vec1 * x_vec1.exp().log1p().tanh() @@ -1367,9 +1338,8 @@ void mish_backward_kernel(TensorIterator& iter) { return dy * (tanh_softplus + x * sigmoid * (1.0f - tanh_softplus * tanh_softplus)); }, [kOneVec](Vectorized dy_vec, Vectorized x_vec) -> Vectorized { - Vectorized x_vec0, x_vec1, dy_vec0, dy_vec1; - std::tie(x_vec0, x_vec1) = convert_to_float(x_vec); - std::tie(dy_vec0, dy_vec1) = convert_to_float(dy_vec); + auto [x_vec0, x_vec1] = convert_to_float(x_vec); + auto [dy_vec0, dy_vec1] = convert_to_float(dy_vec); const Vec sigmoid0 = kOneVec / (kOneVec + x_vec0.neg().exp()); const Vec 
sigmoid1 = kOneVec / (kOneVec + x_vec1.neg().exp()); const Vec tanh_softplus0 = x_vec0.exp().log1p().tanh(); diff --git a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp index b6ba000954d50..6f96d495f85c4 100644 --- a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp @@ -15,14 +15,14 @@ namespace at::native { namespace { template -void cpu_adaptive_avg_pool( +void cpu_adaptive_avg_pool2d( Tensor& output_, const Tensor& input_, IntArrayRef output_size) { auto input = input_.contiguous(); auto output = output_.contiguous(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t ndim = input.ndimension(); @@ -36,7 +36,7 @@ void cpu_adaptive_avg_pool( // parallel on dim of N, C at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { - scalar_t* input_ptr = input_data + c * input_height * input_width; + const scalar_t* input_ptr = input_data + c * input_height * input_width; scalar_t* output_ptr = output_data + c * output_height * output_width; for (const auto oh : c10::irange(output_height)) { @@ -69,7 +69,7 @@ void cpu_adaptive_avg_pool( template typename std::enable_if_t>, void> -cpu_adaptive_avg_pool_channels_last( +cpu_adaptive_avg_pool2d_channels_last( Tensor& output_, const Tensor& input_, IntArrayRef output_size) { @@ -77,7 +77,7 @@ cpu_adaptive_avg_pool_channels_last( auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); @@ -107,7 +107,7 @@ cpu_adaptive_avg_pool_channels_last( scalar_t* out = output_data + i * channels; int64_t size = channels; - // Note: For oridinary usage scenario, each out lane should + // Note: For ordinary usage scenario, each out lane should // fit in L1 cache; otherwise consider block dim C. 
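// (In the ChannelsLast/NHWC layout used here, element (n, c, ih, iw) sits at
// offset ((n * input_height + ih) * input_width + iw) * channels + c, so for a
// fixed spatial position the channel values are contiguous in memory and the
// three passes below can stream over them with Vec::loadu/Vec::store.)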
// Pass I: zero the out lane int64_t d1 = 0; @@ -121,7 +121,7 @@ cpu_adaptive_avg_pool_channels_last( // Pass II: compute local sum for (const auto ih : c10::irange(ih0, ih1)) { for (const auto iw : c10::irange(iw0, iw1)) { - scalar_t* in = input_data + n * input_height * input_width * channels + + const scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; int64_t d2 = 0; @@ -156,7 +156,7 @@ cpu_adaptive_avg_pool_channels_last( template typename std::enable_if_t>, void> -cpu_adaptive_avg_pool_channels_last( +cpu_adaptive_avg_pool2d_channels_last( Tensor& output_, const Tensor& input_, IntArrayRef output_size) { @@ -164,7 +164,7 @@ cpu_adaptive_avg_pool_channels_last( auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); @@ -212,7 +212,7 @@ cpu_adaptive_avg_pool_channels_last( // Pass II: compute local sum for (const auto ih : c10::irange(ih0, ih1)) { for (const auto iw : c10::irange(iw0, iw1)) { - scalar_t* in = input_data + n * input_height * input_width * channels + + const scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; int64_t d2 = 0; @@ -255,13 +255,13 @@ cpu_adaptive_avg_pool_channels_last( } template -void cpu_adaptive_avg_pool_backward( +void cpu_adaptive_avg_pool2d_backward( Tensor& grad_input_, const Tensor& grad_output_) { auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); int64_t ndim = grad_output.ndimension(); @@ -276,7 +276,7 @@ void cpu_adaptive_avg_pool_backward( at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width; - scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; + const scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; for (const auto oh : c10::irange(output_height)) { int64_t ih0 = start_index(oh, output_height, input_height); @@ -305,7 +305,7 @@ void cpu_adaptive_avg_pool_backward( } template -void cpu_adaptive_avg_pool_backward_channels_last( +void cpu_adaptive_avg_pool2d_backward_channels_last( Tensor& grad_input_, const Tensor& grad_output_) { auto memory_format = at::MemoryFormat::ChannelsLast; @@ -313,7 +313,7 @@ void cpu_adaptive_avg_pool_backward_channels_last( auto grad_output = grad_output_.contiguous(memory_format); auto grad_input_data = grad_input.mutable_data_ptr(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); int64_t nbatch = grad_input.size(0); int64_t channels = grad_input.size(1); @@ -327,7 +327,7 @@ void cpu_adaptive_avg_pool_backward_channels_last( at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { for (const auto n : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels; - scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; + const scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; for (const auto oh : 
c10::irange(output_height)) { int64_t ih0 = start_index(oh, output_height, input_height); @@ -339,7 +339,7 @@ void cpu_adaptive_avg_pool_backward_channels_last( int64_t iw1 = end_index(ow, output_width, input_width); int64_t kw = iw1 - iw0; - scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; + const scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; int64_t size = channels; for (const auto ih : c10::irange(ih0, ih1)) { for (const auto iw : c10::irange(iw0, iw1)) { @@ -373,13 +373,13 @@ void adaptive_avg_pool2d_kernel_impl( case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_avg_pool2d", [&] { using param_t = at::opmath_type; - cpu_adaptive_avg_pool(output, input, output_size); + cpu_adaptive_avg_pool2d(output, input, output_size); }); break; } case at::MemoryFormat::ChannelsLast: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_avg_pool2d_channels_last", [&]{ - cpu_adaptive_avg_pool_channels_last(output, input, output_size); + cpu_adaptive_avg_pool2d_channels_last(output, input, output_size); }); break; } @@ -394,13 +394,458 @@ void adapative_avg_pool2d_backward_kernel_impl( switch (grad_output.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_avg_pool2d_backward", [&] { - cpu_adaptive_avg_pool_backward(grad_input, grad_output); + cpu_adaptive_avg_pool2d_backward(grad_input, grad_output); }); break; } case at::MemoryFormat::ChannelsLast: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_avg_pool2d_backward_channels_last", [&]{ - cpu_adaptive_avg_pool_backward_channels_last(grad_input, grad_output); + cpu_adaptive_avg_pool2d_backward_channels_last(grad_input, grad_output); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + + +template +void cpu_adaptive_avg_pool3d( + Tensor& output_, + const Tensor& input_, + IntArrayRef output_size) { + auto input = input_.contiguous(); + auto output = output_.contiguous(); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t ndim = input.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 4 ? 
input.size(0) : input.size(0) * input.size(1); + int64_t input_depth = input.size(-3); + int64_t input_height = input.size(-2); + int64_t input_width = input.size(-1); + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + // parallel on dim of N, C + at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width; + scalar_t* output_ptr = output_data + c * output_depth * output_height * output_width; + + for (const auto od : c10::irange(output_depth)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + int64_t kd = id1 - id0; + + for (const auto oh : c10::irange(output_height)) { + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + for (const auto ow : c10::irange(output_width)) { + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + // compute local average + accscalar_t sum = 0; + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + sum += accscalar_t(input_ptr[id * input_height * input_width + ih * input_width + iw]); + } + } + } + output_ptr[od * output_height * output_width + oh * output_width + ow] = scalar_t(sum / kd / kh / kw); + } + } + } + } + }); + + if (!output_.is_contiguous()) { + output_.copy_(output); + } +} + + +template +typename std::enable_if_t>, void> +cpu_adaptive_avg_pool3d_channels_last( + Tensor& output_, + const Tensor& input_, + IntArrayRef output_size) { + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto input = input_.contiguous(memory_format); + auto output = output_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + using Vec = vec::Vectorized; + // parallel on dim N, H, W + at::parallel_for(0, nbatch * output_depth * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t od = 0; + int64_t oh = 0; + int64_t ow = 0; + data_index_init(begin, n, nbatch, od, output_depth, oh, output_height, ow, output_width); + + for (const auto i : c10::irange(begin, end)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + int64_t kd = id1 - id0; + + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + scalar_t* out = output_data + i * channels; + int64_t size = channels; + + // Note: For oridinary usage scenario, each out lane should + // fit in L1 cache; otherwise consider block dim C. 
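// (The [id0, id1), [ih0, ih1) and [iw0, iw1) windows come from the existing
// start_index/end_index helpers which, assuming they keep their usual
// definitions, evaluate to floor(o * isize / osize) and
// ceil((o + 1) * isize / osize). For example, input_depth = 5 and
// output_depth = 3 give depth windows [0, 2), [1, 4) and [3, 5), so kd is
// 2, 3 and 2 and each output element averages kd * kh * kw inputs.)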
+ // Pass I: zero the out lane + int64_t d1 = 0; + for (; d1 < size - (size % Vec::size()); d1 += Vec::size()) { + Vec out_vec = Vec(scalar_t(0)); + out_vec.store(out + d1); + } + for (; d1 < size; d1++) { + out[d1] = scalar_t(0); + } + // Pass II: compute local sum + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + scalar_t* in = input_data + n * input_depth * input_height * input_width * channels + + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; + + int64_t d2 = 0; + for (; d2 < size - (size % Vec::size()); d2 += Vec::size()) { + Vec out_vec = Vec::loadu(out + d2) + Vec::loadu(in + d2); + out_vec.store(out + d2); + } + for (; d2 < size; d2++) { + out[d2] += in[d2]; + } + } + } + } + // Pass III: compute local average + int64_t d3 = 0; + for (; d3 < size - (size % Vec::size()); d3 += Vec::size()) { + Vec out_vec = Vec::loadu(out + d3) / Vec(scalar_t(kd * kh * kw)); + out_vec.store(out + d3); + } + for (; d3 < size; d3++) { + out[d3] = out[d3] / kd / kh / kw; + } + + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } +} + +template +typename std::enable_if_t>, void> +cpu_adaptive_avg_pool3d_channels_last( + Tensor& output_, + const Tensor& input_, + IntArrayRef output_size) { + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto input = input_.contiguous(memory_format); + auto output = output_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + // parallel on dim N,D, H, W + at::parallel_for(0, nbatch * output_depth * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t oh = 0; + int64_t ow = 0; + int64_t od = 0; + data_index_init(begin, n, nbatch, od, output_depth, oh, output_height, ow, output_width); + + // temp buffer for sum, use float as accumulation type + // can't reuse output buffer to store sum since it is BFloat16/Half + auto sum_arr = std::make_unique(channels); + float* sum = sum_arr.get(); + + for (const auto i : c10::irange(begin, end)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + int64_t kd = id1 - id0; + + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + scalar_t* out = output_data + i * channels; + int64_t size = channels; + + // Pass I: zero the out lane + int64_t d1 = 0; + for (; d1 < size - (size % fVec::size()); d1 += fVec::size()) { + fVec sum_fvec = fVec(float(0)); + sum_fvec.store(sum + d1); + } + for (; d1 < size; d1++) { + sum[d1] = float(0); + } + // Pass II: compute local sum + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for 
(const auto iw : c10::irange(iw0, iw1)) { + scalar_t* in = input_data + n * input_depth * input_height * input_width * channels + + id * input_height * input_width * channels + + ih * input_width * channels + iw * channels; + + int64_t d2 = 0; + for (; d2 < size - (size % bVec::size()); d2 += bVec::size()) { + bVec data_bvec = bVec::loadu(in + d2); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + + fVec sum_fvec0 = fVec::loadu(sum + d2) + data_fvec0; + fVec sum_fvec1 = fVec::loadu(sum + d2 + fVec::size()) + data_fvec1; + sum_fvec0.store(sum + d2); + sum_fvec1.store(sum + d2 + fVec::size()); + } + for (; d2 < size; d2++) { + sum[d2] += float(in[d2]); + } + } + } + } + // Pass III: compute local average + int64_t d3 = 0; + for (; d3 < size - (size % bVec::size()); d3 += bVec::size()) { + fVec out_fvec0 = fVec::loadu(sum + d3) / fVec(float(kd * kh * kw)); + fVec out_fvec1 = fVec::loadu(sum + d3 + fVec::size()) / fVec(float(kd * kh * kw)); + + bVec out_bvec = convert_from_float(out_fvec0, out_fvec1); + out_bvec.store(out + d3); + } + for (; d3 < size; d3++) { + out[d3] = scalar_t(sum[d3] / kd / kh / kw); + } + + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } +} + +template +void cpu_adaptive_avg_pool3d_backward( + Tensor& grad_input_, + const Tensor& grad_output_) { + auto grad_output = grad_output_.contiguous(); + auto grad_input = grad_input_.contiguous(); + + auto grad_output_data = grad_output.data_ptr(); + auto grad_input_data = grad_input.mutable_data_ptr(); + + int64_t ndim = grad_output.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 4 ? 
grad_output.size(0) : grad_output.size(0) * grad_output.size(1); + int64_t input_depth = grad_input.size(-3); + int64_t input_height = grad_input.size(-2); + int64_t input_width = grad_input.size(-1); + int64_t output_depth = grad_output.size(-3); + int64_t output_height = grad_output.size(-2); + int64_t output_width = grad_output.size(-1); + + // parallel on dim of N, C + at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + scalar_t* grad_input_ptr = grad_input_data + c * input_depth * input_height * input_width; + scalar_t* grad_output_ptr = grad_output_data + c * output_depth * output_height * output_width; + + for (const auto od : c10::irange(output_depth)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + int64_t kd = id1 - id0; + for (const auto oh : c10::irange(output_height)) { + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + for (const auto ow : c10::irange(output_width)) { + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + scalar_t grad_delta = grad_output_ptr[od * output_width * output_height + oh * output_width + ow] / kd / kh / kw; + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + grad_input_ptr[id * input_height * input_width + ih * input_width + iw] += grad_delta; + } + } + } + } + } + } + } + }); + + if (!grad_input_.is_contiguous()) { + grad_input_.copy_(grad_input); + } +} + +template +void cpu_adaptive_avg_pool3d_backward_channels_last( + Tensor& grad_input_, + const Tensor& grad_output_) { + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto grad_input = grad_input_.contiguous(memory_format); + auto grad_output = grad_output_.contiguous(memory_format); + + auto grad_input_data = grad_input.mutable_data_ptr(); + auto grad_output_data = grad_output.data_ptr(); + + int64_t nbatch = grad_input.size(0); + int64_t channels = grad_input.size(1); + int64_t input_depth = grad_input.size(2); + int64_t input_height = grad_input.size(3); + int64_t input_width = grad_input.size(4); + int64_t output_depth = grad_output.size(2); + int64_t output_height = grad_output.size(3); + int64_t output_width = grad_output.size(4); + + using Vec = vec::Vectorized; + // parallel on dim N + at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { + for (const auto n : c10::irange(begin, end)) { + scalar_t* grad_input_ptr = grad_input_data + n * input_depth * input_height * input_width * channels; + scalar_t* grad_output_ptr = grad_output_data + n * output_depth * output_height * output_width * channels; + + for (const auto od : c10::irange(output_depth)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + int64_t kd = id1 - id0; + for (const auto oh : c10::irange(output_height)) { + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + for (const auto ow : c10::irange(output_width)) { + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + scalar_t* gout = grad_output_ptr + od * output_height * output_width * channels
+ oh * output_width * channels + ow * channels; + int64_t size = channels; + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + scalar_t* gin = grad_input_ptr + id * input_width * input_height * channels + ih * input_width * channels + iw * channels; + + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec gin_vec = Vec::loadu(gin + d) + Vec::loadu(gout + d) / Vec(scalar_t(kd * kh * kw)); + gin_vec.store(gin + d); + } + for (; d < size; d++) { + gin[d] += gout[d] / kd / kh / kw; + } + } + } + } + } + } + } + } + }); + + if (!grad_input_.is_contiguous(memory_format)) { + grad_input_.copy_(grad_input); + } +} + + +void adaptive_avg_pool3d_kernel_impl( + Tensor& output, + const Tensor& input, + IntArrayRef output_size) { + switch (input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_avg_pool3d", [&] { + using param_t = at::opmath_type; + cpu_adaptive_avg_pool3d(output, input, output_size); + }); + break; + } + case at::MemoryFormat::ChannelsLast3d: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_avg_pool3d_channels_last", [&]{ + cpu_adaptive_avg_pool3d_channels_last(output, input, output_size); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + +void adapative_avg_pool3d_backward_kernel_impl( + Tensor& grad_input, + const Tensor& grad_output) { + switch (grad_output.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_avg_pool3d_backward", [&] { + cpu_adaptive_avg_pool3d_backward(grad_input, grad_output); + }); + break; + } + case at::MemoryFormat::ChannelsLast3d: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_avg_pool3d_backward_channels_last", [&]{ + cpu_adaptive_avg_pool3d_backward_channels_last(grad_input, grad_output); }); break; } @@ -413,5 +858,7 @@ void adapative_avg_pool2d_backward_kernel_impl( REGISTER_DISPATCH(adaptive_avg_pool2d_kernel, &adaptive_avg_pool2d_kernel_impl); REGISTER_DISPATCH(adaptive_avg_pool2d_backward_kernel, &adapative_avg_pool2d_backward_kernel_impl); +REGISTER_DISPATCH(adaptive_avg_pool3d_kernel, &adaptive_avg_pool3d_kernel_impl); +REGISTER_DISPATCH(adaptive_avg_pool3d_backward_kernel, &adapative_avg_pool3d_backward_kernel_impl); } // at::native diff --git a/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp b/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp index 923f0a7034b85..2306fd05d132a 100644 --- a/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp @@ -15,7 +15,7 @@ namespace at::native { namespace { template -void cpu_adaptive_max_pool( +void cpu_adaptive_max_pool2d( const Tensor& output_, const Tensor& indices_, const Tensor& input_, @@ -24,7 +24,7 @@ void cpu_adaptive_max_pool( auto output = output_.contiguous(); auto indices = indices_.contiguous(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -39,7 +39,7 @@ void cpu_adaptive_max_pool( // parallel on dim of N, C at::parallel_for(0, channels, 0, [&](int64_t begin, 
int64_t end) { for (const auto c : c10::irange(begin, end)) { - scalar_t* input_ptr = input_data + c * input_height * input_width; + const scalar_t* input_ptr = input_data + c * input_height * input_width; scalar_t* output_ptr = output_data + c * output_height * output_width; int64_t* indices_ptr = indices_data + c * output_height * output_width; @@ -83,19 +83,19 @@ void cpu_adaptive_max_pool( template typename std::enable_if_t>, void> -cpu_adaptive_max_pool_channels_last( +cpu_adaptive_max_pool2d_channels_last( const Tensor& output_, const Tensor& indices_, const Tensor& input_, IntArrayRef output_size) { TORCH_CHECK(input_.ndimension() == 4, - "adaptive max pooling with channels last format supports tensors with 4 dims"); + "2d adaptive max pooling with channels last format supports tensors with 4 dims"); auto memory_format = at::MemoryFormat::ChannelsLast; auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -109,7 +109,7 @@ cpu_adaptive_max_pool_channels_last( using Vec = vec::Vectorized; using integer_t = vec::int_same_size_t; using iVec = vec::Vectorized; - // for the convience of vectorization, use integer of the same size of scalar_t, + // for the convenience of vectorization, use integer of the same size of scalar_t, // e.g. int32_t for float, int64_t for double // need to make sure doesn't overflow TORCH_CHECK(input_height * input_width <= std::numeric_limits::max()); @@ -151,7 +151,7 @@ cpu_adaptive_max_pool_channels_last( // Pass II: compute local max for (int64_t ih = ih0; ih < ih1; ih ++) { for (int64_t iw = iw0; iw < iw1; iw ++) { - scalar_t* in = input_data + n * input_height * input_width * channels + + const scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; int64_t d2 = 0; @@ -200,19 +200,19 @@ cpu_adaptive_max_pool_channels_last( template typename std::enable_if_t>, void> -cpu_adaptive_max_pool_channels_last( +cpu_adaptive_max_pool2d_channels_last( const Tensor& output_, const Tensor& indices_, const Tensor& input_, IntArrayRef output_size) { TORCH_CHECK(input_.ndimension() == 4, - "adaptive max pooling with channels last format supports tensors with 4 dims"); + "2d adaptive max pooling with channels last format supports tensors with 4 dims"); auto memory_format = at::MemoryFormat::ChannelsLast; auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -269,7 +269,7 @@ cpu_adaptive_max_pool_channels_last( // Pass II: compute local max for (int64_t ih = ih0; ih < ih1; ih ++) { for (int64_t iw = iw0; iw < iw1; iw ++) { - scalar_t* in = input_data + n * input_height * input_width * channels + + const scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; int64_t d2 = 0; @@ -340,7 +340,7 @@ cpu_adaptive_max_pool_channels_last( } template -void cpu_adaptive_max_pool_backward( +void cpu_adaptive_max_pool2d_backward( const Tensor& grad_input_, const Tensor& grad_output_, const Tensor& indices_) { @@ -348,8 +348,8 @@ void cpu_adaptive_max_pool_backward( 
auto indices = indices_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); - auto indices_data = indices.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); int64_t ndim = grad_output.ndimension(); @@ -364,8 +364,8 @@ void cpu_adaptive_max_pool_backward( at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width; - scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; - int64_t* indices_ptr = indices_data + c * output_height * output_width; + const scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; + const int64_t* indices_ptr = indices_data + c * output_height * output_width; for (const auto oh : c10::irange(output_height)) { for (const auto ow : c10::irange(output_width)) { @@ -386,20 +386,20 @@ void cpu_adaptive_max_pool_backward( } template -void cpu_adaptive_max_pool_backward_channels_last( +void cpu_adaptive_max_pool2d_backward_channels_last( const Tensor& grad_input_, const Tensor& grad_output_, const Tensor& indices_) { TORCH_CHECK(grad_output_.ndimension() == 4, - "adaptive max pooling backward with channels last format supports tensors with 4 dims."); + "2d adaptive max pooling backward with channels last format supports tensors with 4 dims."); auto memory_format = at::MemoryFormat::ChannelsLast; auto grad_input = grad_input_.contiguous(memory_format); auto grad_output = grad_output_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); auto grad_input_data = grad_input.mutable_data_ptr(); - auto grad_output_data = grad_output.data_ptr(); - auto indices_data = indices.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); int64_t nbatch = grad_input.size(0); int64_t channels = grad_input.size(1); @@ -412,13 +412,13 @@ void cpu_adaptive_max_pool_backward_channels_last( at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { for (const auto n : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels; - scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; - int64_t* indices_ptr = indices_data + n * output_height * output_width * channels; + const scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; + const int64_t* indices_ptr = indices_data + n * output_height * output_width * channels; for (const auto oh : c10::irange(output_height)) { for (const auto ow : c10::irange(output_width)) { - scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; - int64_t* ind = indices_ptr + oh * output_width * channels + ow * channels; + const scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; + const int64_t* ind = indices_ptr + oh * output_width * channels + ow * channels; // TODO: gcc vectorization for (const auto c : c10::irange(channels)) { int64_t maxindex = ind[c]; @@ -443,13 +443,13 @@ void adaptive_max_pool2d_kernel_impl( case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_max_pool2d", [&] { using param_t = at::opmath_type; - cpu_adaptive_max_pool(output, indices, 
input, output_size); + cpu_adaptive_max_pool2d(output, indices, input, output_size); }); break; } case at::MemoryFormat::ChannelsLast: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_max_pool2d_channels_last", [&]{ - cpu_adaptive_max_pool_channels_last(output, indices, input, output_size); + cpu_adaptive_max_pool2d_channels_last(output, indices, input, output_size); }); break; } @@ -466,13 +466,512 @@ void adaptive_max_pool2d_backward_kernel_impl( switch (grad_input.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_max_pool2d_backward", [&] { - cpu_adaptive_max_pool_backward(grad_input, grad_output, indices); + cpu_adaptive_max_pool2d_backward(grad_input, grad_output, indices); }); break; } case at::MemoryFormat::ChannelsLast: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_max_pool2d_backward_channels_last", [&]{ - cpu_adaptive_max_pool_backward_channels_last(grad_input, grad_output, indices); + cpu_adaptive_max_pool2d_backward_channels_last(grad_input, grad_output, indices); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + +template +void cpu_adaptive_max_pool3d( + const Tensor& output_, + const Tensor& indices_, + const Tensor& input_, + IntArrayRef output_size) { + auto input = input_.contiguous(); + auto output = output_.contiguous(); + auto indices = indices_.contiguous(); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + auto indices_data = indices.data_ptr(); + + int64_t ndim = input.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 4 ? 
input.size(0) : input.size(0) * input.size(1); + int64_t input_depth = input.size(-3); + int64_t input_height = input.size(-2); + int64_t input_width = input.size(-1); + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + // parallel on dim of N, C + at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width; + scalar_t* output_ptr = output_data + c * output_depth * output_height * output_width; + int64_t* indices_ptr = indices_data + c * output_depth * output_height * output_width; + + for (const auto od : c10::irange(output_depth)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + for (const auto oh : c10::irange(output_height)) { + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + + for (const auto ow : c10::irange(output_width)) { + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + + // compute local max + int64_t maxindex = id0 * input_height * input_width + ih0 * input_width + iw0; + accscalar_t maxval = -std::numeric_limits::infinity(); + for (int64_t id = id0; id < id1; id ++) { + for (int64_t ih = ih0; ih < ih1; ih ++) { + for (int64_t iw = iw0; iw < iw1; iw ++) { + int64_t index = id * input_height * input_width + ih * input_width + iw; + scalar_t val = input_ptr[index]; + if ((val > maxval) || std::isnan(val)) { + maxval = val; + maxindex = index; + } + } + } + } + + // set output to local max and store location of max + output_ptr[od * output_height * output_width + oh * output_width + ow] = maxval; + indices_ptr[od * output_height * output_width + oh * output_width + ow] = scalar_t(maxindex); + } + } + } + } + }); + + if (!output_.is_contiguous()) { + output_.copy_(output); + } + if (!indices_.is_contiguous()) { + indices_.copy_(indices); + } +} + +template +typename std::enable_if_t>, void> +cpu_adaptive_max_pool3d_channels_last( + const Tensor& output_, + const Tensor& indices_, + const Tensor& input_, + IntArrayRef output_size) { + TORCH_CHECK(input_.ndimension() == 5, + "3d adaptive max pooling with channels last format supports tensors with 5 dims"); + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto input = input_.contiguous(memory_format); + auto output = output_.contiguous(memory_format); + auto indices = indices_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + auto indices_data = indices.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + using Vec = vec::Vectorized; + using integer_t = vec::int_same_size_t; + using iVec = vec::Vectorized; + // for the convience of vectorization, use integer of the same size of scalar_t, + // e.g. 
int32_t for float, int64_t for double + // need to make sure doesn't overflow + TORCH_CHECK(input_height * input_width <= std::numeric_limits::max()); + + // parallel on dim of N, H, W + at::parallel_for(0, nbatch * output_depth * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t od = 0; + int64_t oh = 0; + int64_t ow = 0; + data_index_init(begin, n, nbatch, od, output_depth, oh, output_height, ow, output_width); + + int64_t size = channels; + int64_t len = size - (size % Vec::size()); + // temp buffer holding index with integer_t + auto index_buffer = std::make_unique(len); + + for (const auto i : c10::irange(begin, end)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + + scalar_t* out = output_data + i * channels; + int64_t* ind = indices_data + i * channels; + + // Pass I: init out lane + iVec index0_vec = iVec(id0 * input_height * input_width + ih0 * input_width + iw0); + Vec out_vec = Vec(-std::numeric_limits::infinity()); + int64_t d1 = 0; + for (; d1 < len; d1 += Vec::size()) { + index0_vec.store(index_buffer.get() + d1); + out_vec.store(out + d1); + } + for (; d1 < size; d1++) { + ind[d1] = id0 * input_height * input_width + ih0 * input_width + iw0; + out[d1] = -std::numeric_limits::infinity(); + } + // Pass II: compute local max + for (int64_t id = id0; id < id1; id ++) { + for (int64_t ih = ih0; ih < ih1; ih ++) { + for (int64_t iw = iw0; iw < iw1; iw ++) { + scalar_t* in = input_data + n * input_depth * input_height * input_width * channels + + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; + + int64_t d2 = 0; + for (; d2 < len; d2 += Vec::size()) { + iVec index_vec = iVec(id * input_height * input_width + ih * input_width + iw); + Vec val_vec = Vec::loadu(in + d2); + iVec maxindex_vec = iVec::loadu(index_buffer.get() + d2); + Vec maxval_vec = Vec::loadu(out + d2); + + // true = all ones, false = all zeros + Vec mask = (val_vec > maxval_vec) | val_vec.isnan(); + iVec imask = vec::cast(mask); + Vec out_vec = Vec::blendv(maxval_vec, val_vec, mask); + iVec ind_vec = iVec::blendv(maxindex_vec, index_vec, imask); + + out_vec.store(out + d2); + ind_vec.store(index_buffer.get() + d2); + } + for (; d2 < size; d2++) { + int64_t index = id * input_height * input_width + ih * input_width + iw; + scalar_t val = in[d2]; + int64_t maxindex = ind[d2]; + scalar_t maxval = out[d2]; + + bool mask = (val > maxval) || std::isnan(val); + out[d2] = mask ? val : maxval; + ind[d2] = mask ? 
index : maxindex; + } + } + } + } + // convert indice data type + vec::convert(index_buffer.get(), ind, len); + + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } + if (!indices_.is_contiguous(memory_format)) { + indices_.copy_(indices); + } +} + +template +typename std::enable_if_t>, void> +cpu_adaptive_max_pool3d_channels_last( + const Tensor& output_, + const Tensor& indices_, + const Tensor& input_, + IntArrayRef output_size) { + TORCH_CHECK(input_.ndimension() == 5, + "3d adaptive max pooling with channels last format supports tensors with 5 dims"); + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto input = input_.contiguous(memory_format); + auto output = output_.contiguous(memory_format); + auto indices = indices_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + auto indices_data = indices.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + using iVec = vec::Vectorized; + // need to make sure doesn't overflow + TORCH_CHECK(input_height * input_width <= std::numeric_limits::max()); + + // parallel on dim of N, H, W + at::parallel_for(0, nbatch * output_depth * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t od = 0; + int64_t oh = 0; + int64_t ow = 0; + data_index_init(begin, n, nbatch, od, output_depth, oh, output_height, ow, output_width); + + int64_t size = channels; + int64_t len = size - (size % bVec::size()); + // temp buffer holding index with integer_t + auto index_buffer = std::make_unique(len); + // temp buffer holding max value with float + auto max_arr = std::make_unique(size); + float* max = max_arr.get(); + + for (const auto i : c10::irange(begin, end)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + + BFloat16* out = output_data + i * channels; + int64_t* ind = indices_data + i * channels; + + // Pass I: init out lane + iVec index0_ivec = iVec(id0 * input_height * input_width + ih0 * input_width + iw0); + fVec max_fvec = fVec(-std::numeric_limits::infinity()); + int64_t d1 = 0; + for (; d1 < len; d1 += fVec::size()) { + index0_ivec.store(index_buffer.get() + d1); + max_fvec.store(max + d1); + } + for (; d1 < size; d1++) { + ind[d1] = id0 * input_height * input_width + ih0 * input_width + iw0; + max[d1] = -std::numeric_limits::infinity(); + } + // Pass II: compute local max + for (int64_t id = id0; id < id1; id ++) { + for (int64_t ih = ih0; ih < ih1; ih ++) { + for (int64_t iw = iw0; iw < iw1; iw ++) { + BFloat16* in = input_data + n * input_depth * input_height * input_width * channels + + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; + + int64_t d2 = 0; + for (; d2 < len; d2 += bVec::size()) { + iVec 
index_ivec = iVec(id * input_height * input_width + ih * input_width + iw); + bVec val_bvec = bVec::loadu(in + d2); + fVec val_fvec0, val_fvec1; + std::tie(val_fvec0, val_fvec1) = convert_bfloat16_float(val_bvec); + + iVec maxindex_ivec0 = iVec::loadu(index_buffer.get() + d2); + iVec maxindex_ivec1 = iVec::loadu(index_buffer.get() + d2 + iVec::size()); + fVec maxval_fvec0 = fVec::loadu(max + d2); + fVec maxval_fvec1 = fVec::loadu(max + d2 + fVec::size()); + + // true = all ones, false = all zeros + fVec mask0 = (val_fvec0 > maxval_fvec0) | val_fvec0.isnan(); + fVec mask1 = (val_fvec1 > maxval_fvec1) | val_fvec1.isnan(); + iVec imask0 = vec::cast(mask0); + iVec imask1 = vec::cast(mask1); + + fVec max_fvec0 = fVec::blendv(maxval_fvec0, val_fvec0, mask0); + fVec max_fvec1 = fVec::blendv(maxval_fvec1, val_fvec1, mask1); + iVec ind_ivec0 = iVec::blendv(maxindex_ivec0, index_ivec, imask0); + iVec ind_ivec1 = iVec::blendv(maxindex_ivec1, index_ivec, imask1); + + max_fvec0.store(max + d2); + max_fvec1.store(max + d2 + fVec::size()); + ind_ivec0.store(index_buffer.get() + d2); + ind_ivec1.store(index_buffer.get() + d2 + iVec::size()); + } + for (; d2 < size; d2++) { + int64_t index = id * input_height * input_width + ih * input_width + iw; + float val = float(in[d2]); + int64_t maxindex = ind[d2]; + float maxval = max[d2]; + + bool mask = (val > maxval) || std::isnan(val); + max[d2] = mask ? val : maxval; + ind[d2] = mask ? index : maxindex; + } + } + } + } + // Pass III: convert max values from float to bfloat16 + int64_t d3 = 0; + for (; d3 < len; d3 += bVec::size()) { + fVec max_fvec0 = fVec::loadu(max + d3); + fVec max_fvec1 = fVec::loadu(max + d3 + fVec::size()); + bVec max_bvec = convert_float_bfloat16(max_fvec0, max_fvec1); + max_bvec.store(out + d3); + } + for (; d3 < size; d3++) { + out[d3] = BFloat16(max[d3]); + } + // convert indice data type + vec::convert(index_buffer.get(), ind, len); + + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } + if (!indices_.is_contiguous(memory_format)) { + indices_.copy_(indices); + } +} + +template +void cpu_adaptive_max_pool3d_backward( + const Tensor& grad_input_, + const Tensor& grad_output_, + const Tensor& indices_) { + auto grad_output = grad_output_.contiguous(); + auto indices = indices_.contiguous(); + auto grad_input = grad_input_.contiguous(); + + auto grad_output_data = grad_output.data_ptr(); + auto indices_data = indices.data_ptr(); + auto grad_input_data = grad_input.mutable_data_ptr(); + + int64_t ndim = grad_output.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 3 ? 
grad_output.size(0) : grad_output.size(0) * grad_output.size(1); + int64_t input_depth = grad_input.size(-3); + int64_t input_height = grad_input.size(-2); + int64_t input_width = grad_input.size(-1); + int64_t output_depth = grad_output.size(-3); + int64_t output_height = grad_output.size(-2); + int64_t output_width = grad_output.size(-1); + + // parallel on dim of N, C + at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + scalar_t* grad_input_ptr = grad_input_data + c * input_depth * input_height * input_width; + scalar_t* grad_output_ptr = grad_output_data + c * output_depth * output_height * output_width; + int64_t* indices_ptr = indices_data + c * output_depth * output_height * output_width; + + for (const auto od : c10::irange(output_depth)) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { + // retrieve position of max + int64_t index = od * output_height * output_width + oh * output_width + ow; + int64_t maxindex = indices_ptr[index]; + + // update gradient + grad_input_ptr[maxindex] += grad_output_ptr[index]; + } + } + } + } + }); + + if (!grad_input_.is_contiguous()) { + grad_input_.copy_(grad_input); + } +} + +template +void cpu_adaptive_max_pool3d_backward_channels_last( + const Tensor& grad_input_, + const Tensor& grad_output_, + const Tensor& indices_) { + TORCH_CHECK(grad_output_.ndimension() == 5, + "3d adaptive max pooling backward with channels last format supports tensors with 5 dims."); + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto grad_input = grad_input_.contiguous(memory_format); + auto grad_output = grad_output_.contiguous(memory_format); + auto indices = indices_.contiguous(memory_format); + + auto grad_input_data = grad_input.mutable_data_ptr(); + auto grad_output_data = grad_output.data_ptr(); + auto indices_data = indices.data_ptr(); + + int64_t nbatch = grad_input.size(0); + int64_t channels = grad_input.size(1); + int64_t input_depth = grad_input.size(2); + int64_t input_height = grad_input.size(3); + int64_t input_width = grad_input.size(4); + int64_t output_depth = grad_output.size(2); + int64_t output_height = grad_output.size(3); + int64_t output_width = grad_output.size(4); + + // parallel on dim N + at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { + for (const auto n : c10::irange(begin, end)) { + scalar_t* grad_input_ptr = grad_input_data + n * input_depth * input_height * input_width * channels; + scalar_t* grad_output_ptr = grad_output_data + n * output_depth * output_height * output_width * channels; + int64_t* indices_ptr = indices_data + n * output_depth * output_height * output_width * channels; + + for (const auto od : c10::irange(output_depth)) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { + scalar_t* gout = grad_output_ptr + od * output_height * output_width * channels + oh * output_width * channels + ow * channels; + int64_t* ind = indices_ptr + od * output_height * output_width * channels + oh * output_width * channels + ow * channels; + // TODO: gcc vectorization + for (const auto c : c10::irange(channels)) { + int64_t maxindex = ind[c]; + grad_input_ptr[maxindex * channels + c] += gout[c]; + } + } + } + } + } + }); + + if (!grad_input_.is_contiguous(memory_format)) { + grad_input_.copy_(grad_input); + } +} + +void adaptive_max_pool3d_kernel_impl( + const Tensor& output, + const Tensor& indices, + const Tensor& input, + 
IntArrayRef output_size) { + switch (input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_max_pool3d", [&] { + using param_t = at::opmath_type; + cpu_adaptive_max_pool3d(output, indices, input, output_size); + }); + break; + } + case at::MemoryFormat::ChannelsLast3d: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_max_pool3d_channels_last", [&]{ + cpu_adaptive_max_pool3d_channels_last(output, indices, input, output_size); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + +void adaptive_max_pool3d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + const Tensor& indices) { + // can't use grad_output memory format to switch here since grad_output might be NC11 + switch (grad_input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_max_pool3d_backward", [&] { + cpu_adaptive_max_pool3d_backward(grad_input, grad_output, indices); + }); + break; + } + case at::MemoryFormat::ChannelsLast3d: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_max_pool3d_backward_channels_last", [&]{ + cpu_adaptive_max_pool3d_backward_channels_last(grad_input, grad_output, indices); }); break; } @@ -485,5 +984,7 @@ void adaptive_max_pool2d_backward_kernel_impl( REGISTER_DISPATCH(adaptive_max_pool2d_kernel, &adaptive_max_pool2d_kernel_impl); REGISTER_DISPATCH(adaptive_max_pool2d_backward_kernel, &adaptive_max_pool2d_backward_kernel_impl); +REGISTER_DISPATCH(adaptive_max_pool3d_kernel, &adaptive_max_pool3d_kernel_impl); +REGISTER_DISPATCH(adaptive_max_pool3d_backward_kernel, &adaptive_max_pool3d_backward_kernel_impl); } // at::native diff --git a/aten/src/ATen/native/cpu/AmpGradScalerKernels.cpp b/aten/src/ATen/native/cpu/AmpGradScalerKernels.cpp new file mode 100644 index 0000000000000..005b9c15060cc --- /dev/null +++ b/aten/src/ATen/native/cpu/AmpGradScalerKernels.cpp @@ -0,0 +1,198 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +namespace { +// Follow the implementations of CUDA. +// Multiplies each tensor in scaled_grads by inv_scale in-place. +// If any element of any tensor in scaled_grads is inf or NaN, sets found_inf +// to 1.0. +// +// Args: +// scaled_grads: A TensorList of scaled gradient tensors. May contain infs or +// NaNs. found_inf: A single-element float tensor to which 1.0 will be written +// if any gradient contain infs/nans. +// Pre-zeroing found_inf, if appropriate, is the responsibility of +// the caller. +// inv_scale: The inverse of the scale factor by which scaled_grads are +// currently multiplied. 
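Editorial illustration only, not part of the patch: the comment block above describes the unscale semantics, and the sketch below restates them as an unfused scalar reference model of what the vectorized kernel that follows computes. The names here (unscale_reference, grads) are hypothetical.

#include <ATen/ATen.h>
#include <vector>

// For every gradient g: flag found_inf if g contains inf/NaN, then scale g by inv_scale.
void unscale_reference(std::vector<at::Tensor>& grads,
                       at::Tensor& found_inf,          // 1-element float CPU tensor
                       const at::Tensor& inv_scale) {  // 1-element float CPU tensor
  const float s = inv_scale.item<float>();
  for (auto& g : grads) {
    // Check finiteness of the original values, matching the fused kernel's ordering.
    if (!at::isfinite(g).all().item<bool>()) {
      found_inf.fill_(1.f);
    }
    g.mul_(s);  // in-place unscale; numerically a no-op when s == 1.f
  }
}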
+void _amp_foreach_non_finite_check_and_unscale_cpu_kernel( + TensorList scaled_grads, + at::Tensor& found_inf, + const at::Tensor& inv_scale) { + if (scaled_grads.size() == 0) { + return; + } + + TORCH_CHECK(inv_scale.is_cpu(), "inv_scale must be a CPU tensor."); + TORCH_CHECK(found_inf.is_cpu(), "found_inf must be a CPU tensor."); + TORCH_CHECK(inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."); + TORCH_CHECK(found_inf.numel() == 1, "found_inf must be a 1-element tensor."); + TORCH_CHECK( + inv_scale.scalar_type() == at::ScalarType::Float, + "inv_scale must be a float tensor."); + TORCH_CHECK( + found_inf.scalar_type() == at::ScalarType::Float, + "found_inf must be a float tensor."); + + // Ensures client code (GradScaler) filtered scaled_grads by dtype. + at::native::check_foreach_api_restrictions(scaled_grads); + for (const at::Tensor& t : scaled_grads) { + TORCH_CHECK(t.is_cpu(), "one of scaled_grads was not a CPU tensor."); + TORCH_CHECK( + t.layout() == at::kStrided, + "one of scaled_grads was not a strided tensor."); + auto iter = at::TensorIterator::unary_op( + const_cast(t), const_cast(t)); + if (at::isReducedFloatingType(iter.dtype())) { + AT_DISPATCH_REDUCED_FLOATING_TYPES( + iter.dtype(), + "_amp_foreach_non_finite_check_and_unscale_cpu", + [&iter, &found_inf, &inv_scale] { + auto* found_inf_ptr = found_inf.data_ptr(); + auto* inv_scale_ptr = inv_scale.data_ptr(); + + using opmath_t = at::opmath_type; + + at::native::cpu_kernel_vec( + iter, + [found_inf_ptr, inv_scale_ptr](scalar_t val_in) -> scalar_t { + auto val = static_cast(val_in); + if (!std::isfinite(val)) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + return static_cast( + inv_scale_val == 1.f ? val : val * inv_scale_val); + }, + [found_inf_ptr, inv_scale_ptr](Vectorized val_vec) -> Vectorized{ + auto [val_vec0, val_vec1] = convert_to_float(val_vec); + if (val_vec0.has_inf_nan() || val_vec1.has_inf_nan()) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + val_vec0 = inv_scale_val == 1.f ? val_vec0 : val_vec0 * Vectorized(inv_scale_val); + val_vec1 = inv_scale_val == 1.f ? val_vec1 : val_vec1 * Vectorized(inv_scale_val); + return convert_from_float(val_vec0, val_vec1); + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES( + iter.dtype(), + "_amp_foreach_non_finite_check_and_unscale_cpu", + [&iter, &found_inf, &inv_scale] { + auto* found_inf_ptr = found_inf.data_ptr(); + auto* inv_scale_ptr = inv_scale.data_ptr(); + at::native::cpu_kernel_vec( + iter, + [found_inf_ptr, inv_scale_ptr](scalar_t val_in) -> scalar_t { + if (!std::isfinite(val_in)) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + return static_cast( + inv_scale_val == 1.f ? val_in : val_in * inv_scale_val); + }, + [found_inf_ptr, inv_scale_ptr](Vectorized val_vec) -> Vectorized{ + if (val_vec.has_inf_nan()) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + return inv_scale_val == 1.f ? val_vec : val_vec * Vectorized(inv_scale_val); + }); + }); + } + } +} + +// _amp_update_scale_cpu updates the scale tensor in place. +// +// Args: +// current_scale: A one-element float tensor containing the scale value. 
+// growth_tracker: A one-element IntTensor containing the number of recent +// consecutive unskipped steps. found_inf: A one-element float tensor. If > 0, +// indicates that infs/nans were found by the relevant +// prior _amp_non_finite_check_and_unscale_cpu call, and 0 if no +// infs/nans were found. +// growth_factor: Multiplier if no infs/NaNs were found (typically slightly > +// 1). backoff_factor: Multiplier if infs/NaNs were found (typically 0.5). +// growth_interval: Number of consecutive unskipped steps that must occur for +// current_scale to be multiplied by +// growth_factor. +// +// Returns: +// current_scale +at::Tensor& _amp_update_scale_cpu_kernel( + at::Tensor& current_scale, + at::Tensor& growth_tracker, + const at::Tensor& found_inf, + double growth_factor, + double backoff_factor, + int64_t growth_interval) { + TORCH_CHECK(growth_tracker.is_cpu(), "growth_tracker must be a CPU tensor."); + TORCH_CHECK(current_scale.is_cpu(), "current_scale must be a CPU tensor."); + TORCH_CHECK(found_inf.is_cpu(), "found_inf must be a CPU tensor."); + TORCH_CHECK( + growth_tracker.numel() == 1, + "growth_tracker must be a 1-element tensor."); + TORCH_CHECK( + current_scale.numel() == 1, "current_scale must be a 1-element tensor."); + TORCH_CHECK(found_inf.numel() == 1, "found_inf must be a 1-element tensor."); + TORCH_CHECK( + growth_tracker.scalar_type() == at::ScalarType::Int, + "growth_tracker must be an int tensor."); + TORCH_CHECK( + current_scale.scalar_type() == at::ScalarType::Float, + "current_scale must be a float tensor."); + TORCH_CHECK( + found_inf.scalar_type() == at::ScalarType::Float, + "found_inf must be a float tensor."); + + float* current_scale_ptr = current_scale.data_ptr(); + int* growth_tracker_ptr = growth_tracker.data_ptr(); + float* found_inf_ptr = found_inf.data_ptr(); + + if (*found_inf_ptr) { + *current_scale_ptr = (*current_scale_ptr) * backoff_factor; + *growth_tracker_ptr = 0; + } else { + // Entering this branch means we just carried out a successful step, + // so growth_tracker is incremented before comparing to growth_interval. + auto successful = (*growth_tracker_ptr) + 1; + if (successful == growth_interval) { + auto new_scale = static_cast((*current_scale_ptr) * growth_factor); + // Do not grow the scale past fp32 bounds to inf. 
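        // Illustrative numbers only (they happen to match GradScaler's documented
        // defaults): with growth_factor = 2.0, backoff_factor = 0.5 and
        // growth_interval = 2000, a step that sets found_inf halves a scale of
        // 65536.f to 32768.f and resets growth_tracker, while 2000 consecutive
        // clean steps double it back to 65536.f; the isfinite check below simply
        // refuses a growth step whose product would overflow to inf.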
+ if (std::isfinite(new_scale)) { + *current_scale_ptr = new_scale; + } + *growth_tracker_ptr = 0; + } else { + *growth_tracker_ptr = successful; + } + } + + return current_scale; +} + +} // namespace + +REGISTER_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu_stub, &_amp_foreach_non_finite_check_and_unscale_cpu_kernel); +REGISTER_DISPATCH(_amp_update_scale_cpu_stub, &_amp_update_scale_cpu_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp index 67d9eda485ffd..572d5af43f651 100644 --- a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp @@ -14,7 +14,7 @@ namespace at::native { namespace { template -void cpu_avg_pool( +void cpu_avg_pool2d( const Tensor& output_, const Tensor& input_, int64_t kW, int64_t kH, @@ -27,7 +27,7 @@ void cpu_avg_pool( auto input = input_.contiguous(); auto output = output_.contiguous(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t numel = output.numel(); @@ -50,7 +50,7 @@ void cpu_avg_pool( output_data[i] = static_cast(0); // local pointers - scalar_t* input_ptr = input_data + c * input_height * input_width; + const scalar_t* input_ptr = input_data + c * input_height * input_width; // compute the mean of the input image... int64_t ih0 = oh * dH - padH; @@ -101,7 +101,7 @@ void cpu_avg_pool( template ::value, int>::type = 0> -void cpu_avg_pool_channels_last( +void cpu_avg_pool2d_channels_last( const Tensor& output_, const Tensor& input_, int64_t kW, int64_t kH, @@ -110,12 +110,12 @@ void cpu_avg_pool_channels_last( bool count_include_pad, c10::optional divisor_override) { TORCH_CHECK(input_.ndimension() == 4, - "average pooling with channels last format supports tensors with 4 dims"); + "2d average pooling with channels last format supports tensors with 4 dims"); auto memory_format = at::MemoryFormat::ChannelsLast; auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); @@ -179,7 +179,7 @@ void cpu_avg_pool_channels_last( // Pass II: compute local sum for (const auto ih : c10::irange(ih0, ih1)) { for (const auto iw : c10::irange(iw0, iw1)) { - scalar_t* in = input_data + n * input_height * input_width * channels + + const scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; int64_t d2 = 0; @@ -215,7 +215,7 @@ void cpu_avg_pool_channels_last( template ::value, int>::type = 0> -void cpu_avg_pool_channels_last( +void cpu_avg_pool2d_channels_last( const Tensor& output_, const Tensor& input_, int64_t kW, int64_t kH, @@ -224,12 +224,12 @@ void cpu_avg_pool_channels_last( bool count_include_pad, c10::optional divisor_override) { TORCH_CHECK(input_.ndimension() == 4, - "average pooling with channels last format supports tensors with 4 dims"); + "2d average pooling with channels last format supports tensors with 4 dims"); auto memory_format = at::MemoryFormat::ChannelsLast; auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); @@ -303,7 +303,7 @@ void cpu_avg_pool_channels_last( // Pass II: compute local sum for (const 
auto ih : c10::irange(ih0, ih1)) { for (const auto iw : c10::irange(iw0, iw1)) { - scalar_t* in = input_data + n * input_height * input_width * channels + + const scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; int64_t d2 = 0; @@ -347,7 +347,7 @@ void cpu_avg_pool_channels_last( } template -void cpu_avg_pool_backward( +void cpu_avg_pool2d_backward( const Tensor& grad_input_, const Tensor& grad_output_, int kW, int kH, @@ -358,7 +358,7 @@ void cpu_avg_pool_backward( auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); int64_t ndim = grad_output.ndimension(); @@ -373,7 +373,7 @@ void cpu_avg_pool_backward( at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width; - scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; + const scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; for (const auto oh : c10::irange(output_height)) { for (const auto ow : c10::irange(output_width)) { @@ -415,7 +415,7 @@ void cpu_avg_pool_backward( } template -void cpu_avg_pool_backward_channels_last( +void cpu_avg_pool2d_backward_channels_last( const Tensor& grad_input_, const Tensor& grad_output_, int kW, int kH, @@ -428,7 +428,7 @@ void cpu_avg_pool_backward_channels_last( auto grad_output = grad_output_.contiguous(memory_format); auto grad_input_data = grad_input.mutable_data_ptr(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); int64_t nbatch = grad_input.size(0); int64_t channels = grad_input.size(1); @@ -442,7 +442,7 @@ void cpu_avg_pool_backward_channels_last( at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { for (const auto n : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels; - scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; + const scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; for (const auto oh : c10::irange(output_height)) { for (const auto ow : c10::irange(output_width)) { @@ -463,11 +463,11 @@ void cpu_avg_pool_backward_channels_last( if(count_include_pad) { divide_factor = pool_size; } else { - divide_factor = (ih1 - ih0) * (iw1 - iw0); + divide_factor = (ih1 - ih0) * (iw1 - iw0); } } - scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; + const scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; int64_t size = channels; int64_t len = size - (size % Vec::size()); for (const auto ih : c10::irange(ih0, ih1)) { @@ -505,13 +505,13 @@ void avg_pool2d_kernel_impl( switch (input.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, input.scalar_type(), "avg_pool2d", [&] { - cpu_avg_pool(output, input, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); + cpu_avg_pool2d(output, input, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); }); break; } case at::MemoryFormat::ChannelsLast: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, input.scalar_type(), 
"avg_pool2d_channels_last", [&] { - cpu_avg_pool_channels_last(output, input, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); + cpu_avg_pool2d_channels_last(output, input, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); }); break; } @@ -531,13 +531,569 @@ void avg_pool2d_backward_kernel_impl( switch (grad_output.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, grad_output.scalar_type(), "avg_pool2d_backward", [&] { - cpu_avg_pool_backward(grad_input, grad_output, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); + cpu_avg_pool2d_backward(grad_input, grad_output, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); }); break; } case at::MemoryFormat::ChannelsLast: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, grad_output.scalar_type(), "avg_pool2d_backward_channels_last", [&] { - cpu_avg_pool_backward_channels_last(grad_input, grad_output, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); + cpu_avg_pool2d_backward_channels_last(grad_input, grad_output, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + + +template +void cpu_avg_pool3d( + const Tensor& output_, + const Tensor& input_, + int64_t kW, int64_t kH, int64_t kD, + int64_t dW, int64_t dH, int64_t dD, + int64_t padW, int64_t padH, int64_t padD, + bool count_include_pad, + c10::optional divisor_override) { + using acc_t = at::opmath_type; + + auto input = input_.contiguous(); + auto output = output_.contiguous(); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t numel = output.numel(); + int64_t ndim = input.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 4 ? input.size(0) : input.size(0) * input.size(1); + int64_t input_depth = input.size(-3); + int64_t input_height = input.size(-2); + int64_t input_width = input.size(-1); + int64_t output_depth = output.size(-3); + int64_t output_height = output.size(-2); + int64_t output_width = output.size(-1); + + // parallel on dim N, C, D, H, W + at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { + int64_t c = 0; + int64_t od = 0; + int64_t oh = 0; + int64_t ow = 0; + data_index_init(begin, c, channels, od, output_depth, oh, output_height, ow, output_width); + + for (const auto i : c10::irange(begin, end)) { + output_data[i] = static_cast(0); + + // local pointers + scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width; + + // compute the mean of the input image... 
+ int64_t id0 = od * dD - padD; + int64_t ih0 = oh * dH - padH; + int64_t iw0 = ow * dW - padW; + int64_t id1 = std::min(id0 + kD, input_depth + padD); + int64_t ih1 = std::min(ih0 + kH, input_height + padH); + int64_t iw1 = std::min(iw0 + kW, input_width + padW); + int64_t pool_size = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + id0 = std::max(id0, (int64_t) 0); + ih0 = std::max(ih0, (int64_t) 0); + iw0 = std::max(iw0, (int64_t) 0); + id1 = std::min(id1, input_depth); + ih1 = std::min(ih1, input_height); + iw1 = std::min(iw1, input_width); + + if (id0 >= id1 || ih0 >= ih1 || iw0 >= iw1) { + // move on to next output index + data_index_step(c, channels, od, output_depth, oh, output_height, ow, output_width); + continue; + } + + acc_t sum = 0; + + int64_t divide_factor; + if (divisor_override.has_value()) { + divide_factor = divisor_override.value(); + } else { + if(count_include_pad) { + divide_factor = pool_size; + } else { + divide_factor = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + } + } + + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + sum += input_ptr[id * input_height * input_width + ih * input_width + iw]; + } + } + } + output_data[i] += scalar_t(sum / divide_factor); + + // move on to next output index + data_index_step(c, channels, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous()) { + output_.copy_(output); + } +} + +template ::value, int>::type = 0> +void cpu_avg_pool3d_channels_last( + const Tensor& output_, + const Tensor& input_, + int64_t kW, int64_t kH, int64_t kD, + int64_t dW, int64_t dH, int64_t dD, + int64_t padW, int64_t padH, int64_t padD, + bool count_include_pad, + c10::optional divisor_override) { + TORCH_CHECK(input_.ndimension() == 5, + "3d average pooling with channels last format supports tensors with 5 dims"); + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto input = input_.contiguous(memory_format); + auto output = output_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + int64_t output_depth = output.size(2); + int64_t output_height = output.size(3); + int64_t output_width = output.size(4); + + using Vec = vec::Vectorized; + // parallel on dim N, H, W + at::parallel_for(0, nbatch * output_depth * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t od = 0; + int64_t oh = 0; + int64_t ow = 0; + data_index_init(begin, n, nbatch, od, output_depth, oh, output_height, ow, output_width); + + int64_t size = channels; + int64_t len = size - (size % Vec::size()); + for (const auto i : c10::irange(begin, end)) { + // compute the mean of the input image... 
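      // The window/divisor arithmetic below mirrors the contiguous kernel above;
      // what changes in this channels-last path is the inner loop, which runs over
      // the contiguous channel dimension: Pass II accumulates Vec::size() channels
      // at a time via Vec::loadu(out + d) + Vec::loadu(in + d), Pass III divides by
      // Vec(divide_factor), and a scalar tail handles the remaining
      // size % Vec::size() channels.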
+ int64_t id0 = od * dD - padD; + int64_t ih0 = oh * dH - padH; + int64_t iw0 = ow * dW - padW; + int64_t id1 = std::min(id0 + kD, input_depth + padD); + int64_t ih1 = std::min(ih0 + kH, input_height + padH); + int64_t iw1 = std::min(iw0 + kW, input_width + padW); + int64_t pool_size = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + id0 = std::max(id0, (int64_t) 0); + ih0 = std::max(ih0, (int64_t) 0); + iw0 = std::max(iw0, (int64_t) 0); + id1 = std::min(id1, input_depth); + ih1 = std::min(ih1, input_height); + iw1 = std::min(iw1, input_width); + + int64_t divide_factor; + if (divisor_override.has_value()) { + divide_factor = divisor_override.value(); + } else { + if(count_include_pad) { + divide_factor = pool_size; + } else { + divide_factor = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + } + } + + scalar_t* out = output_data + i * channels; + + // Pass I: zero the out lane + int64_t d1 = 0; + for (; d1 < len; d1 += Vec::size()) { + Vec out_vec = Vec(scalar_t(0)); + out_vec.store(out + d1); + } + for (; d1 < size; d1++) { + out[d1] = scalar_t(0); + } + + if (id0 >= id1 || ih0 >= ih1 || iw0 >= iw1) { + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + continue; + } + + // Pass II: compute local sum + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + scalar_t* in = input_data + n * input_depth * input_height * input_width * channels + + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; + + int64_t d2 = 0; + for (; d2 < len; d2 += Vec::size()) { + Vec out_vec = Vec::loadu(out + d2) + Vec::loadu(in + d2); + out_vec.store(out + d2); + } + for (; d2 < size; d2++) { + out[d2] += in[d2]; + } + } + } + } + + // Pass III: compute local average + int64_t d3 = 0; + for (; d3 < len; d3 += Vec::size()) { + Vec out_vec = Vec::loadu(out + d3) / Vec(scalar_t(divide_factor)); + out_vec.store(out + d3); + } + for (; d3 < size; d3++) { + out[d3] = out[d3] / divide_factor; + } + + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } +} + +template ::value, int>::type = 0> +void cpu_avg_pool3d_channels_last( + const Tensor& output_, + const Tensor& input_, + int64_t kW, int64_t kH, int64_t kD, + int64_t dW, int64_t dH, int64_t dD, + int64_t padW, int64_t padH, int64_t padD, + bool count_include_pad, + c10::optional divisor_override) { + TORCH_CHECK(input_.ndimension() == 5, + "3d average pooling with channels last format supports tensors with 5 dims"); + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto input = input_.contiguous(memory_format); + auto output = output_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + int64_t output_depth = output.size(2); + int64_t output_height = output.size(3); + int64_t output_width = output.size(4); + + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + // parallel on dim N, H, W + at::parallel_for(0, nbatch * output_depth * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t od = 0; + int64_t oh = 0; + int64_t ow = 0; + 
data_index_init(begin, n, nbatch, od, output_depth, oh, output_height, ow, output_width); + + // temp buffer for sum, use float as accumulation type + // can't reuse output buffer to store sum since it is BFloat16 + auto sum_arr = std::make_unique(channels); + float* sum = sum_arr.get(); + + int64_t size = channels; + for (const auto i : c10::irange(begin, end)) { + // compute the mean of the input image... + int64_t id0 = od * dD - padD; + int64_t ih0 = oh * dH - padH; + int64_t iw0 = ow * dW - padW; + int64_t id1 = std::min(id0 + kD, input_depth + padD); + int64_t ih1 = std::min(ih0 + kH, input_height + padH); + int64_t iw1 = std::min(iw0 + kW, input_width + padW); + int64_t pool_size = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + id0 = std::max(id0, (int64_t) 0); + ih0 = std::max(ih0, (int64_t) 0); + iw0 = std::max(iw0, (int64_t) 0); + id1 = std::min(id1, input_depth); + ih1 = std::min(ih1, input_height); + iw1 = std::min(iw1, input_width); + + int64_t divide_factor; + if (divisor_override.has_value()) { + divide_factor = divisor_override.value(); + } else { + if(count_include_pad) { + divide_factor = pool_size; + } else { + divide_factor = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + } + } + + BFloat16* out = output_data + i * channels; + + // Pass I: zero the out lane + int64_t d1 = 0; + for (; d1 < size - (size % fVec::size()); d1 += fVec::size()) { + fVec sum_fvec = fVec(float(0)); + sum_fvec.store(sum + d1); + } + for (; d1 < size; d1++) { + sum[d1] = float(0); + } + + if (id0 >= id1 || ih0 >= ih1 || iw0 >= iw1) { + // since we are not directly using output as the accumulation buffer, + // in case the kernel window is out of range, need to zero the output buffer here. + for (int64_t k = 0; k < size; k++) { + out[k] = 0; + } + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + continue; + } + + // Pass II: compute local sum + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + BFloat16* in = input_data + n * input_depth * input_height * input_width * channels + + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; + + int64_t d2 = 0; + for (; d2 < size - (size % bVec::size()); d2 += bVec::size()) { + bVec data_bvec = bVec::loadu(in + d2); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + + fVec sum_fvec0 = fVec::loadu(sum + d2) + data_fvec0; + fVec sum_fvec1 = fVec::loadu(sum + d2 + fVec::size()) + data_fvec1; + sum_fvec0.store(sum + d2); + sum_fvec1.store(sum + d2 + fVec::size()); + } + for (; d2 < size; d2++) { + sum[d2] += float(in[d2]); + } + } + } + } + + // Pass III: compute local average + int64_t d3 = 0; + for (; d3 < size - (size % bVec::size()); d3 += bVec::size()) { + fVec out_fvec0 = fVec::loadu(sum + d3) / fVec(float(divide_factor)); + fVec out_fvec1 = fVec::loadu(sum + d3 + fVec::size()) / fVec(float(divide_factor)); + + bVec out_bvec = convert_float_bfloat16(out_fvec0, out_fvec1); + out_bvec.store(out + d3); + } + for (; d3 < size; d3++) { + out[d3] = BFloat16(sum[d3] / divide_factor); + } + + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } +} + +template +void cpu_avg_pool3d_backward( + const Tensor& grad_input_, + const Tensor& grad_output_, + int kW, int kH, int kD, 
+ int dW, int dH, int dD, + int padW, int padH, int padD, + bool count_include_pad, + c10::optional divisor_override) { + auto grad_output = grad_output_.contiguous(); + auto grad_input = grad_input_.contiguous(); + + auto grad_output_data = grad_output.data_ptr(); + auto grad_input_data = grad_input.mutable_data_ptr(); + + int64_t ndim = grad_output.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 4 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1); + int64_t input_depth = grad_input.size(-3); + int64_t input_height = grad_input.size(-2); + int64_t input_width = grad_input.size(-1); + int64_t output_depth = grad_output.size(-3); + int64_t output_height = grad_output.size(-2); + int64_t output_width = grad_output.size(-1); + + // parallel on dim of N, C + at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + scalar_t* grad_input_ptr = grad_input_data + c * input_depth * input_height * input_width; + scalar_t* grad_output_ptr = grad_output_data + c * output_depth * output_height * output_width; + + for (const auto od : c10::irange(output_depth)) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { + int64_t id0 = od * dD - padD; + int64_t ih0 = oh * dH - padH; + int64_t iw0 = ow * dW - padW; + int64_t id1 = std::min(id0 + kD, input_depth + padD); + int64_t ih1 = std::min(ih0 + kH, input_height + padH); + int64_t iw1 = std::min(iw0 + kW, input_width + padW); + int64_t pool_size = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + id0 = std::max(id0, (int64_t) 0); + ih0 = std::max(ih0, (int64_t) 0); + iw0 = std::max(iw0, (int64_t) 0); + ih1 = std::min(ih1, input_height); + iw1 = std::min(iw1, input_width); + + int64_t divide_factor; + if (divisor_override.has_value()) { + divide_factor = divisor_override.value(); + } else { + if(count_include_pad) { + divide_factor = pool_size; + } else { + divide_factor = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + } + } + + scalar_t grad_delta = grad_output_ptr[od * output_height * output_width + oh * output_width + ow] / divide_factor; + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + grad_input_ptr[id * input_height * input_width + ih * input_width + iw] += grad_delta; + } + } + } + } + } + } + } + }); + + if (!grad_input_.is_contiguous()) { + grad_input_.copy_(grad_input); + } +} + +template +void cpu_avg_pool3d_backward_channels_last( + const Tensor& grad_input_, + const Tensor& grad_output_, + int kW, int kH, int kD, + int dW, int dH, int dD, + int padW, int padH, int padD, + bool count_include_pad, + c10::optional divisor_override) { + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto grad_input = grad_input_.contiguous(memory_format); + auto grad_output = grad_output_.contiguous(memory_format); + + auto grad_input_data = grad_input.mutable_data_ptr(); + auto grad_output_data = grad_output.data_ptr(); + + int64_t nbatch = grad_input.size(0); + int64_t channels = grad_input.size(1); + int64_t input_depth = grad_input.size(2); + int64_t input_height = grad_input.size(3); + int64_t input_width = grad_input.size(4); + int64_t output_depth = grad_output.size(2); + int64_t output_height = grad_output.size(3); + int64_t output_width = grad_output.size(4); + + using Vec = vec::Vectorized; + // parallel on dim N + at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { + for 
(const auto n : c10::irange(begin, end)) { + scalar_t* grad_input_ptr = grad_input_data + n * input_depth * input_height * input_width * channels; + scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; + + for (const auto od : c10::irange(output_depth)) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { + int64_t id0 = od * dD - padD; + int64_t ih0 = oh * dH - padH; + int64_t iw0 = ow * dW - padW; + int64_t id1 = std::min(id0 + kD, input_depth + padD); + int64_t ih1 = std::min(ih0 + kH, input_height + padH); + int64_t iw1 = std::min(iw0 + kW, input_width + padW); + int64_t pool_size = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + id0 = std::max(id0, (int64_t) 0); + ih0 = std::max(ih0, (int64_t) 0); + iw0 = std::max(iw0, (int64_t) 0); + id1 = std::min(id1, input_depth); + ih1 = std::min(ih1, input_height); + iw1 = std::min(iw1, input_width); + + int64_t divide_factor; + if (divisor_override.has_value()) { + divide_factor = divisor_override.value(); + } else { + if(count_include_pad) { + divide_factor = pool_size; + } else { + divide_factor = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + } + } + + scalar_t* gout = grad_output_ptr + od * output_height * output_width * channels + oh * output_width * channels + ow * channels; + int64_t size = channels; + int64_t len = size - (size % Vec::size()); + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + scalar_t* gin = grad_input_ptr + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; + + int64_t d = 0; + for (; d < len; d += Vec::size()) { + Vec gin_vec = Vec::loadu(gin + d) + Vec::loadu(gout + d) / Vec(scalar_t(divide_factor)); + gin_vec.store(gin + d); + } + for (; d < size; d++) { + gin[d] += gout[d] / divide_factor; + } + } + } + } + } + } + } + } + }); + + if (!grad_input_.is_contiguous(memory_format)) { + grad_input_.copy_(grad_input); + } +} + + + +void avg_pool3d_kernel_impl( + const Tensor& output, + const Tensor& input, + int64_t kW, int64_t kH, int64_t kD, + int64_t dW, int64_t dH, int64_t dD, + int64_t padW, int64_t padH, int64_t padD, + bool count_include_pad, + c10::optional divisor_override) { + switch (input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, input.scalar_type(), "avg_pool3d", [&] { + cpu_avg_pool3d(output, input, kW, kH, kD, dW, dH, dD, padW, padH, padD, count_include_pad, divisor_override); + }); + break; + } + case at::MemoryFormat::ChannelsLast: { + AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, input.scalar_type(), "avg_pool3d_channels_last", [&] { + cpu_avg_pool3d_channels_last(output, input, kW, kH, kD, dW, dH, dD, padW, padH, padD, count_include_pad, divisor_override); }); break; } @@ -546,9 +1102,39 @@ void avg_pool2d_backward_kernel_impl( } } + +void avg_pool3d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + int kW, int kH, int kD, + int dW, int dH, int dD, + int padW, int padH, int padD, + bool count_include_pad, + c10::optional divisor_override) { + switch (grad_output.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, grad_output.scalar_type(), "avg_pool3d_backward", [&] { + cpu_avg_pool3d_backward(grad_input, grad_output, kW, kH, kD, dW, dH, dD, padW, padH, padD, count_include_pad, 
divisor_override); + }); + break; + } + case at::MemoryFormat::ChannelsLast3d: { + AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, grad_output.scalar_type(), "avg_pool3d_backward_channels_last", [&] { + cpu_avg_pool3d_backward_channels_last(grad_input, grad_output, kW, kH, kD, dW, dH, dD, padW, padH, padD, count_include_pad, divisor_override); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + + } // anonymous namespace REGISTER_DISPATCH(avg_pool2d_kernel, &avg_pool2d_kernel_impl); REGISTER_DISPATCH(avg_pool2d_backward_kernel, &avg_pool2d_backward_kernel_impl); +REGISTER_DISPATCH(avg_pool3d_kernel, &avg_pool3d_kernel_impl); +REGISTER_DISPATCH(avg_pool3d_backward_kernel, &avg_pool3d_backward_kernel_impl); } // at::native diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 8958126d107e2..e008113c10b10 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -32,8 +32,8 @@ inline Vectorized binary_op_scalar( const Vectorized& a, opmath_t b, const Op& op) { - Vectorized a0, a1, vec_b(b); - std::tie(a0, a1) = convert_to_float(a); + Vectorized vec_b(b); + auto [a0, a1] = convert_to_float(a); return convert_from_float(op(a0, vec_b), op(a1, vec_b)); } @@ -90,10 +90,7 @@ void atan2_kernel(TensorIteratorBase& iter) { kHalf, \ kBool, \ kBFloat16, \ - kFloat8_e5m2, \ - kFloat8_e5m2fnuz, \ - kFloat8_e4m3fn, \ - kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)) + AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)) #define _AT_DISPATCH_ALL_TYPES_NO_BOOL(TYPE, NAME, ...) \ AT_DISPATCH_V2( \ TYPE, \ @@ -102,12 +99,10 @@ void atan2_kernel(TensorIteratorBase& iter) { kComplexHalf, \ kHalf, \ kBFloat16, \ - kFloat8_e5m2, \ - kFloat8_e4m3fn, \ - AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)) + AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)) #define _AT_DISPATCH_MUL_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \ - kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)) + kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)) #else #define _AT_DISPATCH_ALL_TYPES_AND_BOOL(TYPE, NAME, ...) 
\ AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \ @@ -347,9 +342,8 @@ void remainder_kernel(TensorIteratorBase& iter) { return mod0; }, [=](Vectorized a, Vectorized b) { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_bfloat16_float(a); - std::tie(b0, b1) = convert_bfloat16_float(b); + auto [a0, a1] = convert_bfloat16_float(a); + auto [b0, b1] = convert_bfloat16_float(b); auto mod0 = a0.fmod(b0); auto mod1 = a1.fmod(b1); const auto zero = Vectorized(0); @@ -748,9 +742,8 @@ void smooth_l1_kernel(TensorIteratorBase& iter, double beta) { }, [&beta_val_vec, &point_five_vec]( Vectorized a, Vectorized b) { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_bfloat16_float(a); - std::tie(b0, b1) = convert_bfloat16_float(b); + auto [a0, a1] = convert_bfloat16_float(a); + auto [b0, b1] = convert_bfloat16_float(b); auto z = (a0 - b0).abs(); a0 = Vectorized::blendv( point_five_vec * z * z / beta_val_vec, @@ -835,9 +828,8 @@ void sigmoid_backward_kernel(TensorIteratorBase& iter) { return a0 * (float(1) - b0) * b0; }, [=](Vectorized a, Vectorized b) { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_bfloat16_float(a); - std::tie(b0, b1) = convert_bfloat16_float(b); + auto [a0, a1] = convert_bfloat16_float(a); + auto [b0, b1] = convert_bfloat16_float(b); a0 = a0 * (one_vec - b0) * b0; a1 = a1 * (one_vec - b1) * b1; return convert_float_bfloat16(a0, a1); @@ -933,9 +925,8 @@ void tanh_backward_kernel(TensorIteratorBase& iter) { return a0 * (float{1} - b0 * b0); }, [=](Vectorized a, Vectorized b) { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); a0 = a0 * (one_vec - b0 * b0); a1 = a1 * (one_vec - b1 * b1); return convert_from_float(a0, a1); @@ -1017,9 +1008,8 @@ void logaddexp_kernel(TensorIteratorBase& iter) { } }, [=](Vec a, Vec b) -> Vec { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); Vectorized inf(std::numeric_limits::infinity()); Vectorized m0 = maximum(a0, b0); Vectorized m1 = maximum(a1, b1); @@ -1082,9 +1072,8 @@ void logaddexp2_kernel(TensorIteratorBase& iter) { } }, [=](Vec a, Vec b) -> Vec { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); Vectorized inf(std::numeric_limits::infinity()); Vectorized inv_log_2_vec(inv_log_2); Vectorized m0 = maximum(a0, b0); diff --git a/aten/src/ATen/native/cpu/BlasKernel.cpp b/aten/src/ATen/native/cpu/BlasKernel.cpp index d0761584f0442..1cc53da3584ea 100644 --- a/aten/src/ATen/native/cpu/BlasKernel.cpp +++ b/aten/src/ATen/native/cpu/BlasKernel.cpp @@ -1,9 +1,41 @@ #define TORCH_ASSERT_NO_OPERATORS #include +#include #include +#include #include #include +#if defined(__aarch64__) && !defined(C10_MOBILE) +#include + +namespace at::native::blas_impl { +void fp16_gemv_notrans( + const int m, + const int n, + const float alpha, + const float16_t* a, + const int lda, + const float16_t* x, + const int incx, + const float beta, + float16_t* y, + const int incy); + +void fp16_gemv_trans( + const int m, + const int n, + const float alpha, + const float16_t* a, + const int lda, + const float16_t* x, + const int incx, + const float beta, + float16_t* y, + const int incy); +} +#endif + namespace at::native { namespace cpublas { 
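[Editor's note] The BlasKernel.cpp header hunk above forward-declares aarch64 half-precision GEMV helpers (fp16_gemv_notrans / fp16_gemv_trans) in at::native::blas_impl; the at::Half gemm specializations later in this file route to them when n == 1 and beta == 0. Below is a minimal scalar model of the transposed variant's expected semantics, with the BLAS-style argument convention inferred from those call sites; gemv_trans_ref and the template parameter T are illustrative names, not part of the patch, and this is a reference sketch rather than the NEON implementation.

    // y[j] = alpha * dot(column j of a, x) + beta * y[j], with the products
    // accumulated in float even when T is a 16-bit floating-point type.
    template <typename T>
    void gemv_trans_ref(int m, int n, float alpha, const T* a, int lda,
                        const T* x, int incx, float beta, T* y, int incy) {
      for (int j = 0; j < n; ++j) {
        float acc = 0.0f;
        for (int i = 0; i < m; ++i) {
          acc += float(a[j * lda + i]) * float(x[i * incx]);
        }
        float prev = (beta == 0.0f) ? 0.0f : beta * float(y[j * incy]);
        y[j * incy] = T(alpha * acc + prev);
      }
    }
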
namespace { @@ -121,6 +153,7 @@ gemm_notrans_( template void gemm_transa_( + TransposeType transa, int64_t m, int64_t n, int64_t k, opmath_t alpha, const scalar_t *a, int64_t lda, @@ -133,7 +166,7 @@ void gemm_transa_( const scalar_t *b_ = b; for (const auto j : c10::irange(n)) { const auto dot = sum(k, [&](int64_t l) -> opmath_t { - return static_cast(a_[l]) * static_cast(b_[l]); + return static_cast(transa == TransposeType::ConjTranspose ? conj_impl(a_[l]) : a_[l]) * static_cast(b_[l]); }); b_ += ldb; if (beta == opmath_t(0)) { @@ -149,6 +182,7 @@ void gemm_transa_( template typename std::enable_if::value, void>::type gemm_transb_( + TransposeType transb, int64_t m, int64_t n, int64_t k, @@ -166,7 +200,7 @@ gemm_transb_( // c += alpha * (a @ b.T) for (const auto l : c10::irange(k)) { for (const auto j : c10::irange(n)) { - opmath_t val = b[j + l * ldb] * alpha; + opmath_t val = (transb == TransposeType::ConjTranspose ? conj_impl(b[j + l * ldb]) : b[j + l * ldb]) * alpha; int64_t i_m = m / 4; for (const auto i_i : c10::irange(i_m)) { c[j * ldc + i_i * 4 + 0] += a[i_i * 4 + 0 + l * lda] * val; @@ -185,6 +219,7 @@ gemm_transb_( template typename std::enable_if::value, void>::type gemm_transb_( + TransposeType transb, int64_t m, int64_t n, int64_t k, @@ -201,7 +236,7 @@ gemm_transb_( for (const auto j : c10::irange(n)) { const auto dot = sum(k, [&](int64_t l) -> opmath_t { return static_cast(a[l * lda + i]) * - static_cast(b[l * ldb + j]); + static_cast(transb == TransposeType::ConjTranspose ? conj_impl(b[l * ldb + j]) : b[l * ldb + j]); }); if (beta == opmath_t(0)) { c[j * ldc + i] = alpha * dot; @@ -214,6 +249,7 @@ gemm_transb_( template void gemm_transab_( + TransposeType transa, TransposeType transb, int64_t m, int64_t n, int64_t k, opmath_t alpha, const scalar_t *a, int64_t lda, @@ -224,8 +260,8 @@ void gemm_transab_( for (const auto i : c10::irange(m)) { for (const auto j : c10::irange(n)) { const auto dot = sum(k, [&](int64_t l) -> opmath_t { - return static_cast(a[i * lda + l]) * - static_cast(b[l * ldb + j]); + return static_cast(transa == TransposeType::ConjTranspose ? conj_impl(a[i * lda + l]) : a[i * lda + l]) * + static_cast(transb == TransposeType::ConjTranspose ? 
conj_impl(b[l * ldb + j]) : b[l * ldb + j]); }); if (beta == opmath_t(0)) { @@ -237,6 +273,91 @@ void gemm_transab_( } } +#if defined(__aarch64__) && !defined(C10_MOBILE) +template <> +void gemm_notrans_( + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::Half* a, + int64_t lda, + const at::Half* b, + int64_t ldb, + float beta, + at::Half* c, + int64_t ldc) { + // c += alpha * (a @ b) + if (n == 1 && beta == 0.0) { + at::native::blas_impl::fp16_gemv_notrans(m, k, alpha, reinterpret_cast(a), lda, reinterpret_cast(b), 1, beta, reinterpret_cast(c), 1); + return; + } + for (const auto i : c10::irange(m)) { + for (const auto j : c10::irange(n)) { + const auto dot = sum(k, [&](int64_t l) -> float { + return float(c10::detail::fp16_from_bits(a[l * lda + i].x)) * + float(c10::detail::fp16_from_bits(b[j * ldb + l].x)); + }); + if (beta == 0) { + c[j * ldc + i] = alpha * dot; + } else { + c[j * ldc + i] = beta * c[j * ldc + i] + alpha * dot; + } + } + } +} + + +static float compute_dot(const float16_t *a, const float16_t *b, int64_t l) { + if ((l&3) != 0) { + return sum(l, [&](int64_t i) -> float { + return float(a[i]) * float(b[i]); + }); + } + float32x4_t rcv = vdupq_n_f32(0); + for (int64_t idx = 0; idx < l; idx += 4) { + float32x4_t aVec = vcvt_f32_f16(vld1_f16(a + idx)); + float32x4_t bVec = vcvt_f32_f16(vld1_f16(b + idx)); + rcv = vaddq_f32(rcv, vmulq_f32(aVec, bVec)); + } + auto sum = vpaddq_f32(rcv, rcv); + return vgetq_lane_f32(vpaddq_f32(sum, sum), 0); +} + +template <> +void gemm_transa_( + TransposeType transa, + int64_t m, int64_t n, int64_t k, + float alpha, + const at::Half *a, int64_t lda, + const at::Half *b, int64_t ldb, + float beta, + at::Half *c, int64_t ldc) { + // c = alpha * (a.T @ b) + beta * c + if (n == 1 && beta == 0.0) { + at::native::blas_impl::fp16_gemv_trans(k, m, alpha, reinterpret_cast(a), lda, reinterpret_cast(b), 1, beta, reinterpret_cast(c), 1); + return; + } + parallel_for(0, m, 1, [&](int64_t begin, int64_t end) { + const auto *a_ = a + begin * lda; + for (const auto i : c10::irange(begin, end)) { + const auto *b_ = b; + for (const auto j : c10::irange(n)) { + const auto dot = compute_dot(reinterpret_cast(a_), reinterpret_cast(b_), k); + b_ += ldb; + if (beta == 0) { + c[j*ldc+i] = alpha*dot; + } else { + c[j*ldc+i] = beta*c[j*ldc+i]+alpha*dot; + } + } + a_ += lda; + } + }); +} + +#endif + template void gemm_core_( TransposeType transa, TransposeType transb, @@ -250,23 +371,22 @@ void gemm_core_( transb == TransposeType::NoTranspose) { return gemm_notrans_(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } else if ( - transa == TransposeType::Transpose && - transb != TransposeType::Transpose) { - gemm_transa_(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + transa != TransposeType::NoTranspose && + transb == TransposeType::NoTranspose) { + gemm_transa_(transa, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } else if ( transa == TransposeType::NoTranspose && - transb == TransposeType::Transpose) { - gemm_transb_(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - } else { // transa == TransposeType::Transpose && transb == - // TransposeType::Transpose - gemm_transab_(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + transb != TransposeType::NoTranspose) { + gemm_transb_(transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } else { + gemm_transab_(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } } #if !defined(C10_MOBILE) -#define _AT_DISPATCH_GEMM_TYPES(TYPE, NAME, ...) 
\ - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \ - kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, \ +#define _AT_DISPATCH_GEMM_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND6( \ + kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, \ TYPE, NAME, __VA_ARGS__) #else #define _AT_DISPATCH_GEMM_TYPES(TYPE, NAME, ...) \ diff --git a/aten/src/ATen/native/cpu/CatKernel.cpp b/aten/src/ATen/native/cpu/CatKernel.cpp index d3a83b2334515..23d9aa1708ba7 100644 --- a/aten/src/ATen/native/cpu/CatKernel.cpp +++ b/aten/src/ATen/native/cpu/CatKernel.cpp @@ -12,11 +12,11 @@ namespace at::native { namespace { struct InputMeta { - void* data_ptr; + const void* data_ptr; int64_t inner_size; InputMeta(const Tensor& t, int64_t dim, int64_t inner) - : data_ptr(t.data_ptr()) + : data_ptr(t.const_data_ptr()) , inner_size(t.sizes()[dim] * inner) {} }; @@ -38,7 +38,7 @@ void cat_serial_kernel_impl(const Tensor& result, const MaterializedITensorListR for (const auto i : c10::irange(outer)) { for (const auto j : c10::irange(ninputs)) { int64_t local_inner = inputs[j].inner_size; - scalar_t* input_ptr = (scalar_t*)(inputs[j].data_ptr) + i * local_inner; + const scalar_t* input_ptr = (const scalar_t*)(inputs[j].data_ptr) + i * local_inner; int64_t d = 0; for (; d < local_inner - (local_inner % Vec::size()); d += Vec::size()) { Vec in_vec = Vec::loadu(input_ptr + d); diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp index a815896cc799c..906fa8911e884 100644 --- a/aten/src/ATen/native/cpu/CopyKernel.cpp +++ b/aten/src/ATen/native/cpu/CopyKernel.cpp @@ -71,7 +71,7 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne using Vecs = Vectorized; c10::SmallBuffer ptrs(2); dest_t* output_data = iter.tensor_base(0).data_ptr(); - scalar_t* input_data = iter.tensor_base(1).data_ptr(); + scalar_t* input_data = const_cast(iter.tensor_base(1).const_data_ptr()); ptrs[0] = reinterpret_cast(output_data); ptrs[1] = reinterpret_cast(input_data); @@ -139,7 +139,7 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne using Vecs = Vectorized; c10::SmallBuffer ptrs(2); dest_t* output_data = iter.tensor_base(0).data_ptr(); - source_t* input_data = iter.tensor_base(1).data_ptr(); + source_t* input_data = const_cast(iter.tensor_base(1).const_data_ptr()); ptrs[0] = reinterpret_cast(output_data); ptrs[1] = reinterpret_cast(input_data); diff --git a/aten/src/ATen/native/cpu/CrossKernel.cpp b/aten/src/ATen/native/cpu/CrossKernel.cpp index 0394a9f524abd..d982f63dd0508 100644 --- a/aten/src/ATen/native/cpu/CrossKernel.cpp +++ b/aten/src/ATen/native/cpu/CrossKernel.cpp @@ -21,8 +21,8 @@ static void apply_cross(const Tensor& result, const Tensor& a, const Tensor& b, int64_t b_stride = b.stride(dim); int64_t r_stride = result.stride(dim); - scalar_t *a_ptr = a.data_ptr(); - scalar_t *b_ptr = b.data_ptr(); + const scalar_t *a_ptr = a.const_data_ptr(); + const scalar_t *b_ptr = b.const_data_ptr(); scalar_t *r_ptr = result.data_ptr(); parallel_for(0, total, internal::GRAIN_SIZE, [&](int64_t s, int64_t e) { diff --git a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp index 3a34ad3f7a6e8..573d5de9cde19 100644 --- a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp +++ b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp @@ -292,16 +292,21 @@ Tensor _convolution_depthwise3x3_winograd( bias_potentially_undefined : at::zeros({kernel_sizes[0]}, input.options()); + auto 
input_data = input.const_data_ptr(); + auto kernel_data = kernel.const_data_ptr(); + auto bias_data = bias.const_data_ptr(); + auto output_data = output.data_ptr(); + at::parallel_for(0, args.batch * args.out_channels, 0, [&](int64_t start, int64_t end) { for (const auto k : c10::irange(start, end)) { const int64_t g = k % args.out_channels; const int64_t i = k / (args.out_channels / groups); convolution_depthwise3x3_winograd_impl( args, - input.data_ptr() + i * input_hxw, - kernel.data_ptr() + g * 3 * 3, - bias.data_ptr() + g, - output.data_ptr() + k * output_hxw); + input_data + i * input_hxw, + kernel_data + g * 3 * 3, + bias_data + g, + output_data + k * output_hxw); } }); diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index f2346759cbc09..04d82d365baa3 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -146,7 +146,7 @@ struct Dist { template static void run_parallel_pdist(Tensor& result, const Tensor& self, const scalar_t p) { - const scalar_t * const self_start = self.data_ptr(); + const scalar_t * const self_start = self.const_data_ptr(); const scalar_t * const self_end = self_start + self.numel(); int64_t n = self.size(0); int64_t m = self.size(1); @@ -203,8 +203,8 @@ struct Dist { template static void run_parallel_cdist(Tensor& result, const Tensor& t1, const Tensor& t2, const scalar_t p) { - const scalar_t * const t1_start = t1.data_ptr(); - const scalar_t * const t2_start = t2.data_ptr(); + const scalar_t * const t1_start = t1.const_data_ptr(); + const scalar_t * const t2_start = t2.const_data_ptr(); int64_t d = t1.size(0); int64_t r1 = t1.size(-2); int64_t r2 = t2.size(-2); @@ -296,14 +296,14 @@ struct Dist { const int64_t m = self.size(1); const int64_t gs = grad.stride(0); - const scalar_t * const grad_start = grad.data_ptr(); - const scalar_t * const dist_start = dist.data_ptr(); - const scalar_t * const self_start = self.data_ptr(); + const scalar_t * const grad_start = grad.const_data_ptr(); + const scalar_t * const dist_start = dist.const_data_ptr(); + const scalar_t * const self_start = self.const_data_ptr(); scalar_t * const res_start = result.data_ptr(); // The only way to parallelize and avoid locking requires parallelizing // over the columns of the input, i.e. we compute the gradient for the - // first section of each vector independentaly of the second section, etc. + // first section of each vector independently of the second section, etc. at::parallel_for(0, m / Vec::size(), internal::GRAIN_SIZE / (8 * n * n), [p, n, m, gs, grad_start, dist_start, self_start, res_start](int64_t l, int64_t end) { const Vec pvec(p); @@ -367,10 +367,10 @@ struct Dist { //don't use grad.stride(-1), because if last dimension is 1, stride can be bogus. 
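[Editor's note] The DepthwiseConvKernel hunk above hoists the const_data_ptr()/data_ptr() calls out of the parallel_for body so the hot loop only does pointer arithmetic, and the DistanceOpsKernel changes switch read-only accesses to const_data_ptr(). A minimal sketch of that pattern, assuming a recent ATen that provides const_data_ptr/mutable_data_ptr and contiguous float tensors; scale_rows is a made-up helper, not part of the patch.

    #include <ATen/ATen.h>
    #include <ATen/Parallel.h>

    void scale_rows(const at::Tensor& in, at::Tensor& out, float s) {
      TORCH_CHECK(in.is_contiguous() && out.is_contiguous(), "expects contiguous tensors");
      // Fetch the pointers once, outside the parallel region.
      const float* in_data = in.const_data_ptr<float>();
      float* out_data = out.mutable_data_ptr<float>();
      const int64_t rows = in.size(0);
      const int64_t cols = in.size(1);
      at::parallel_for(0, rows, 0, [&](int64_t begin, int64_t end) {
        for (int64_t r = begin; r < end; ++r) {
          for (int64_t c = 0; c < cols; ++c) {
            out_data[r * cols + c] = s * in_data[r * cols + c];
          }
        }
      });
    }
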
const int64_t gs = 1; - const scalar_t * const grad_start = grad.data_ptr(); - const scalar_t * const dist_start = dist.data_ptr(); - const scalar_t * const t1_start = t1.data_ptr(); - const scalar_t * const t2_start = t2.data_ptr(); + const scalar_t * const grad_start = grad.const_data_ptr(); + const scalar_t * const dist_start = dist.const_data_ptr(); + const scalar_t * const t1_start = t1.const_data_ptr(); + const scalar_t * const t2_start = t2.const_data_ptr(); scalar_t * const res_start = result.data_ptr(); at::parallel_for(0, m / Vec::size(), internal::GRAIN_SIZE / (16 * r1), [=](int64_t l, int64_t end) { diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 1a1039b916f8e..93a9b33b29285 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -15,7 +15,6 @@ #include #endif - namespace at { namespace native { namespace templates { @@ -149,6 +148,62 @@ static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t s } } +#if defined(__VSX__) || defined(CPU_CAPABILITY_VSX) +static void normal_fill_16_VSX(float *data,const Vectorized &two_pi,const Vectorized &one,const Vectorized &minus_two,const Vectorized &mean,const Vectorized &std) { + using Vec = Vectorized; + Vec u1=one-Vec::loadu(data); + Vec u2=Vec::loadu(data+8); + Vec radius=(minus_two * u1.log()); + radius=radius.sqrt(); + Vec theta=two_pi * u2; + Vec output_vec=radius * theta.cos() * std + mean; + Vec output_vec2=radius * theta.sin() * std + mean; + output_vec.store(data); + output_vec2.store(data+8); +} + +template +void normal_fill_VSX(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) { + float *data = self.data_ptr(); + auto size = self.numel(); + std::lock_guard lock(generator->mutex_); + for (const auto i : c10::irange(size)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + + using Vec = Vectorized; + const Vec two_pi = Vec(2.0f * c10::pi); + const Vec one = Vec(1.0f); + const Vec minus_two = Vec(-2.0f); + const Vec var_vec = Vec(std); + const Vec mean_vec = Vec(mean); + + for (int64_t i = 0; i < size - 15; i += 16) { + if(Vec::size()==8) { + normal_fill_16_VSX(data + i, two_pi, one, minus_two, mean_vec, var_vec); + } + else{ + normal_fill_16(data + i, mean, std); + } + } + if (size % 16 != 0) { + // Recompute the last 16 values. 
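[Editor's note] The new VSX path above first fills the buffer with uniform samples and then applies the Box-Muller transform 16 floats at a time in normal_fill_16_VSX. The scalar math behind one vector lane pair is sketched below; box_muller_pair is an illustrative name only, and this is the per-pair model, not the vectorized code.

    #include <cmath>

    // Two uniforms in [0, 1) become two independent normal samples.
    void box_muller_pair(float u1, float u2, float mean, float stddev,
                         float& out1, float& out2) {
      constexpr float two_pi = 6.2831853071795864769f;
      float r = std::sqrt(-2.0f * std::log(1.0f - u1));  // 1 - u1 keeps the log argument in (0, 1]
      float theta = two_pi * u2;
      out1 = r * std::cos(theta) * stddev + mean;
      out2 = r * std::sin(theta) * stddev + mean;
    }
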
+ data = data + size - 16; + for (const auto i : c10::irange(16)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + if(Vec::size()==8){ + normal_fill_16_VSX(data, two_pi, one, minus_two, mean_vec, var_vec); + } + else{ + normal_fill_16(data, mean, std); + } + } +} +#endif //VSX + template void normal_fill(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) { scalar_t *data = self.data_ptr(); @@ -179,6 +234,8 @@ void normal_kernel(const TensorBase &self, double mean, double std, RNG generato if (self.scalar_type() == ScalarType::Float && size >= 16 && self.is_contiguous()) { #ifdef CPU_CAPABILITY_AVX2 normal_fill_AVX2(self, static_cast(mean), static_cast(std), generator); +#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX) + normal_fill_VSX(self, static_cast(mean), static_cast(std), generator); #else normal_fill(self, static_cast(mean), static_cast(std), generator); #endif @@ -321,7 +378,7 @@ void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG generato auto p = expand_inplace(self, p_cpu); auto iter = TensorIteratorConfig() .add_output(self) - .add_input(*p) + .add_const_input(*p) .check_all_same_dtype(false) .build(); if (p->scalar_type() == kDouble) { diff --git a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp index e58ebd17c255a..cb96f24ebdde6 100644 --- a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp +++ b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp @@ -23,25 +23,25 @@ namespace { // 1) out = exp(a - val) // 2) val = sum(out) -template +template inline void _exp_reduce_sum_fusion_kernel( - scalar_t* a, + T1* a, const int& size, - scalar_t* out, - scalar_t& val) { - auto vec_size = vec::Vectorized::size(); - auto vec_max = vec::Vectorized(val); - scalar_t tmp_sum = 0; - auto vec_tmp_sum = vec::Vectorized(tmp_sum); + T2* out, + T1& val) { + auto vec_size = vec::Vectorized::size(); + auto vec_max = vec::Vectorized(val); + T1 tmp_sum = 0; + auto vec_tmp_sum = vec::Vectorized(tmp_sum); for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) { - auto tmp0 = vec::Vectorized::loadu(a + i); + auto tmp0 = vec::Vectorized::loadu(a + i); auto tmp1 = tmp0 - vec_max; auto tmp2 = tmp1.exp_u20(); vec_tmp_sum += tmp2; _store(out + i, tmp2); } - tmp_sum = vec::vec_reduce_all( - [](vec::Vectorized& x, vec::Vectorized& y) { + tmp_sum = vec::vec_reduce_all( + [](vec::Vectorized& x, vec::Vectorized& y) { return x + y; }, vec_tmp_sum); @@ -55,27 +55,6 @@ inline void _exp_reduce_sum_fusion_kernel( val = tmp_sum; } -// out = a / sum -template -inline void _normalization_kernel( - const T1* a, - const T1& sum, - const int& size, - T2* out) { - auto vec_size = vec::Vectorized::size(); - auto vec_sum = vec::Vectorized(sum); - for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) { - auto tmp0 = vec::Vectorized::loadu(a + i); - auto tmp1 = tmp0 / vec_sum; - _store(out + i, tmp1); - } - for (long i = vec_size * (size / vec_size); i < size; i++) { - auto tmp0 = a[i]; - auto tmp1 = tmp0 / sum; - out[i] = tmp1; - } -} - // 1) out = a * scale // 2) max = max(out) template @@ -112,7 +91,7 @@ inline void _mul_reduce_max_fusion_kernel( template static inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { - TORCH_INTERNAL_ASSERT(ptr2 == nullptr); + TORCH_CHECK(ptr2 == nullptr); return ptr; } @@ -253,11 +232,11 @@ void cpu_flash_attention( at::Tensor buf_reduced = at::empty({num_thread, qSplitSize, is_reduced_type ? 
kvSplitSize : 0}, query.options()); // Data ptrs - scalar_t* q_data = query.data_ptr(); - scalar_t* k_data = key.data_ptr(); - scalar_t* v_data = value.data_ptr(); - accum_t* mask_data = has_attn_mask - ? attn_mask.value().data_ptr() + const scalar_t* q_data = query.const_data_ptr(); + const scalar_t* k_data = key.const_data_ptr(); + const scalar_t* v_data = value.const_data_ptr(); + const accum_t* mask_data = has_attn_mask + ? attn_mask.value().const_data_ptr() : nullptr; scalar_t* out_data = output.data_ptr(); accum_t* lse_data = logsumexp.data_ptr(); @@ -331,9 +310,8 @@ void cpu_flash_attention( } } // Update coefficients with Softmax - accum_t tmp_max = 0, tmp_sum = 0, sum_old = 0, exp_tmp = 0; + accum_t tmp_max = 0, tmp_sum = 0, exp_tmp = 0; for (int64_t row = 0; row < qBlockSize; ++row) { - sum_old = qk_sum_data[row]; if (has_attn_mask) { // max per row tmp_max = at::vec::reduce_all( @@ -352,23 +330,20 @@ void cpu_flash_attention( tmp_max = qk_max_data[row] > tmp_max ? qk_max_data[row] : tmp_max; // qk <- exp(qk - max) and sum per row tmp_sum = tmp_max; - _exp_reduce_sum_fusion_kernel(qk_data + row * kvBlockSize, kvBlockSize, qk_data + row * kvBlockSize, tmp_sum); + _exp_reduce_sum_fusion_kernel( + qk_data + row * kvBlockSize, kvBlockSize, + conditional_data_ptr(qk_data, qk_reduced_data) + row * kvBlockSize, + tmp_sum); // exp_tmp <- exp(max[row] - max) exp_tmp = std::exp(qk_max_data[row] - tmp_max); // sum[row] <- sum + exp_tmp * sum[row] qk_sum_data[row] = tmp_sum + exp_tmp * qk_sum_data[row]; // max[row] <- max qk_max_data[row] = tmp_max; - // qk <- qk / sum[row] - accum_t sum_new = qk_sum_data[row]; - _normalization_kernel(qk_data + row * kvBlockSize, sum_new, kvBlockSize, - conditional_data_ptr(qk_data, qk_reduced_data) + row * kvBlockSize); - // dst <- dst * sum_old / sum_new * exp_tmp + // dst <- dst * exp_tmp if (n > 0) { - accum_t sum_cor = sum_old / sum_new; vec::map( - [sum_cor, exp_tmp](Vec x) - { return x * Vec(sum_cor) * Vec(exp_tmp); }, + [exp_tmp](Vec x) { return x * Vec(exp_tmp); }, dst_data + row * headSize, dst_data + row * headSize, headSize); } } @@ -389,10 +364,12 @@ void cpu_flash_attention( dst_data, headSize); } + // dst <- dst / sum[row] // reorder MHA output with strides for (int64_t row = 0; row < qBlockSize; ++row) { + accum_t sum_reciprocal = 1 / qk_sum_data[row]; vec::map( - [](Vec x) { return x; }, + [sum_reciprocal](Vec x) { return x * Vec(sum_reciprocal); }, out_data + i * oStrideB + j * oStrideH + m * oStrideM + row * oStrideM, dst_data + row * headSize, headSize); @@ -516,15 +493,15 @@ void cpu_flash_attention_backward( scalar_t* grad_q_data = grad_q.data_ptr(); scalar_t* grad_k_data = grad_k.data_ptr(); scalar_t* grad_v_data = grad_v.data_ptr(); - scalar_t* grad_out_data = grad_out.data_ptr(); - scalar_t* q_data = query.data_ptr(); - scalar_t* k_data = key.data_ptr(); - scalar_t* v_data = value.data_ptr(); - accum_t* mask_data = has_attn_mask - ? attn_mask.value().data_ptr() + const scalar_t* grad_out_data = grad_out.const_data_ptr(); + const scalar_t* q_data = query.const_data_ptr(); + const scalar_t* k_data = key.const_data_ptr(); + const scalar_t* v_data = value.const_data_ptr(); + const accum_t* mask_data = has_attn_mask + ? attn_mask.value().const_data_ptr() : nullptr; - scalar_t* out_data = out.data_ptr(); - accum_t* lse_data = logsumexp.data_ptr(); + const scalar_t* out_data = out.const_data_ptr(); + const accum_t* lse_data = logsumexp.const_data_ptr(); accum_t* buf_data = buf.data_ptr(); scalar_t* buf_reduced_data = is_reduced_type ? 
buf_reduced.data_ptr() : nullptr; @@ -721,7 +698,7 @@ void flash_attention_kernel_impl( c10::optional scale) { auto q_seq_len = query.size(2); - AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, query.scalar_type(), "flash_attention", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, query.scalar_type(), "flash_attention", [&] { if (q_seq_len >= 768) { cpu_flash_attention( output, logsumexp, query, key, value, @@ -758,7 +735,7 @@ void flash_attention_backward_kernel_impl( auto grad_out_contig = grad_out.contiguous(); auto q_seq_len = query.size(1); - AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, query.scalar_type(), "flash_attention_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, query.scalar_type(), "flash_attention_backward", [&] { if (q_seq_len >= 768) { cpu_flash_attention_backward( grad_q, grad_k, grad_v, grad_out_contig, diff --git a/aten/src/ATen/native/cpu/FusedAdamKernel.cpp b/aten/src/ATen/native/cpu/FusedAdamKernel.cpp new file mode 100644 index 0000000000000..4a10fe202c4a0 --- /dev/null +++ b/aten/src/ATen/native/cpu/FusedAdamKernel.cpp @@ -0,0 +1,379 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +namespace at::native { + +namespace{ + +template +typename std::enable_if< + std::is_same::value || std::is_same::value, + void>:: + type inline adam_math( + scalar_t* param_ptr, + scalar_t* exp_avg_ptr, + scalar_t* exp_avg_sq_ptr, + scalar_t* grad_ptr, + scalar_t* max_exp_avg_sq_ptr, + double lr, + double bias_correction1, + double bias_correction2, + double exp_avg_grad_coefficient, + double exp_avg_sq_grad_coefficient, + double bias_correction2_sqrt, + double eps, + double weight_decay, + double beta2, + bool amsgrad, + bool maximize, + const float* grad_scale_ptr, + int64_t size +){ + double step_size = lr / bias_correction1; + using lpVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + lpVec grad_vec_to_store; + int64_t d = 0; + fVec param_vec1, param_vec2; + fVec grad_vec1, grad_vec2; + fVec exp_avg_vec1, exp_avg_vec2; + fVec exp_avg_sq_vec1, exp_avg_sq_vec2; + fVec max_exp_avg_sq_vec1, max_exp_avg_sq_vec2; + for (; d < size - (size % lpVec::size()); d += lpVec::size()) { + lpVec param_lpvec = lpVec::loadu(param_ptr + d); + std::tie(param_vec1, param_vec2) = vec::convert_to_float(param_lpvec); + lpVec grad_lpvec = lpVec::loadu(grad_ptr + d); + std::tie(grad_vec1, grad_vec2) = vec::convert_to_float(grad_lpvec); + if (grad_scale_ptr) { + grad_vec1 = grad_vec1 / fVec(float(*grad_scale_ptr)); + grad_vec2 = grad_vec2 / fVec(float(*grad_scale_ptr)); + grad_vec_to_store = vec::convert_from_float(grad_vec1, grad_vec2); + grad_vec_to_store.store(grad_ptr + d); + } + if (maximize){ + grad_vec1 = grad_vec1 * fVec(opmath_t(-1.0)); + grad_vec2 = grad_vec2 * fVec(opmath_t(-1.0)); + } + if (weight_decay != 0.f){ + if constexpr (adam_mode == ADAM_MODE::ORIGINAL) { + grad_vec1 += param_vec1 * fVec(opmath_t(weight_decay)); + grad_vec2 += param_vec2 * fVec(opmath_t(weight_decay)); + } else if constexpr (adam_mode == ADAM_MODE::ADAMW) { + param_vec1 = param_vec1 * fVec(opmath_t(1 - lr * weight_decay)); + param_vec2 = param_vec2 * fVec(opmath_t(1 - lr * weight_decay)); + } + } + + lpVec exp_avg_lpvec = lpVec::loadu(exp_avg_ptr + d); + std::tie(exp_avg_vec1, exp_avg_vec2) = vec::convert_to_float(exp_avg_lpvec); + + // exp_avg.lerp_(grad, 1 - beta1) + const fVec lerp_weight = fVec(opmath_t(exp_avg_grad_coefficient)); + auto mask = lerp_weight.abs() < fVec(0.5); + auto coeff = 
fVec::blendv(lerp_weight - fVec(1), lerp_weight, mask); + + auto base1 = fVec::blendv(grad_vec1, exp_avg_vec1, mask); + exp_avg_vec1 = vec::fmadd(coeff, grad_vec1 - exp_avg_vec1, base1); + + auto base2 = fVec::blendv(grad_vec2, exp_avg_vec2, mask); + exp_avg_vec2 = vec::fmadd(coeff, grad_vec2 - exp_avg_vec2, base2); + + lpVec exp_avg_sq_lpvec = lpVec::loadu(exp_avg_sq_ptr + d); + std::tie(exp_avg_sq_vec1, exp_avg_sq_vec2) = vec::convert_to_float(exp_avg_sq_lpvec); + exp_avg_sq_vec1 = exp_avg_sq_vec1 * fVec(opmath_t(beta2)) + + fVec(opmath_t(exp_avg_sq_grad_coefficient)) * grad_vec1 * grad_vec1; + exp_avg_sq_vec2 = exp_avg_sq_vec2 * fVec(opmath_t(beta2)) + + fVec(opmath_t(exp_avg_sq_grad_coefficient)) * grad_vec2 * grad_vec2; + + vec::convert_from_float(exp_avg_vec1, exp_avg_vec2).store(exp_avg_ptr + d); + vec::convert_from_float(exp_avg_sq_vec1, exp_avg_sq_vec2).store(exp_avg_sq_ptr + d); + + fVec denom_vec1, denom_vec2; + if (amsgrad) { + lpVec max_exp_avg_sq_lpvec = lpVec::loadu(max_exp_avg_sq_ptr + d); + std::tie(max_exp_avg_sq_vec1, max_exp_avg_sq_vec2) = vec::convert_to_float(max_exp_avg_sq_lpvec); + max_exp_avg_sq_vec1 = maximum(max_exp_avg_sq_vec1, exp_avg_sq_vec1); + max_exp_avg_sq_vec2 = maximum(max_exp_avg_sq_vec2, exp_avg_sq_vec2); + vec::convert_from_float(max_exp_avg_sq_vec1, max_exp_avg_sq_vec2).store(max_exp_avg_sq_ptr + d); + denom_vec1 = + (max_exp_avg_sq_vec1.sqrt() / fVec(opmath_t(bias_correction2_sqrt))) + fVec(opmath_t(eps)); + denom_vec2 = + (max_exp_avg_sq_vec2.sqrt() / fVec(opmath_t(bias_correction2_sqrt))) + fVec(opmath_t(eps)); + } else { + denom_vec1 = + (exp_avg_sq_vec1.sqrt() / fVec(opmath_t(bias_correction2_sqrt))) + fVec(opmath_t(eps)); + denom_vec2 = + (exp_avg_sq_vec2.sqrt() / fVec(opmath_t(bias_correction2_sqrt))) + fVec(opmath_t(eps)); + } + param_vec1 = param_vec1 + fVec(opmath_t(-step_size)) * exp_avg_vec1 / denom_vec1; + param_vec2 = param_vec2 + fVec(opmath_t(-step_size)) * exp_avg_vec2 / denom_vec2; + vec::convert_from_float(param_vec1, param_vec2).store(param_ptr + d); + } + scalar_t grad_val_to_store; + for (; d < size; d++) { + opmath_t grad_val = grad_ptr[d]; + opmath_t param_val = param_ptr[d]; + if (grad_scale_ptr) { + grad_val = grad_ptr[d] / float(*grad_scale_ptr); + grad_val_to_store = scalar_t(grad_val); + grad_ptr[d] = grad_val_to_store; + } + if (maximize) grad_val = -grad_val; + if (weight_decay != 0.f){ + if constexpr (adam_mode == ADAM_MODE::ORIGINAL) { + grad_val += param_val * opmath_t(weight_decay); + } else if constexpr (adam_mode == ADAM_MODE::ADAMW) { + param_val = param_val * opmath_t(1 - lr * weight_decay); + } + } + // exp_avg.lerp_(grad, 1 - beta1) + opmath_t exp_avg_var = exp_avg_ptr[d]; + auto is_lerp_weight_small = std::abs(opmath_t(exp_avg_grad_coefficient)) < opmath_t(0.5); + if (is_lerp_weight_small) { + exp_avg_var = exp_avg_var + opmath_t(exp_avg_grad_coefficient) * (grad_val - exp_avg_var); + } else { + exp_avg_var = grad_val - (grad_val - exp_avg_var) * (opmath_t(1) - opmath_t(exp_avg_grad_coefficient)); + } + exp_avg_ptr[d] = scalar_t(exp_avg_var); + opmath_t exp_avg_sq_var = exp_avg_sq_ptr[d]; + exp_avg_sq_var = exp_avg_sq_var * opmath_t(beta2); + exp_avg_sq_var = exp_avg_sq_var + + opmath_t(exp_avg_sq_grad_coefficient) * grad_val * grad_val; + exp_avg_sq_ptr[d] = scalar_t(exp_avg_sq_var); + opmath_t demon_val; + if (amsgrad) { + opmath_t max_exp_avg_sq_var = max_exp_avg_sq_ptr[d]; + max_exp_avg_sq_var = std::max(max_exp_avg_sq_var, exp_avg_sq_var); + max_exp_avg_sq_ptr[d] = + scalar_t(max_exp_avg_sq_var); + 
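[Editor's note] For reference, the per-element math that both the vectorized and scalar paths of adam_math above implement, written out as a plain scalar sketch (ORIGINAL mode only; maximize, amsgrad and grad scaling omitted; all names here are illustrative, not part of the patch).

    #include <cmath>
    #include <cstdint>

    void adam_step_ref(float& param, float& exp_avg, float& exp_avg_sq, float grad,
                       double lr, double beta1, double beta2, double eps,
                       double weight_decay, int64_t step) {
      double bias_correction1 = 1.0 - std::pow(beta1, step);
      double bias_correction2_sqrt = std::sqrt(1.0 - std::pow(beta2, step));
      double step_size = lr / bias_correction1;

      double g = grad + weight_decay * param;                    // ADAM_MODE::ORIGINAL weight decay
      exp_avg = float(exp_avg + (1.0 - beta1) * (g - exp_avg));  // exp_avg.lerp_(grad, 1 - beta1)
      exp_avg_sq = float(beta2 * exp_avg_sq + (1.0 - beta2) * g * g);

      double denom = std::sqrt(double(exp_avg_sq)) / bias_correction2_sqrt + eps;
      param = float(param - step_size * exp_avg / denom);
    }
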
demon_val = + std::sqrt(max_exp_avg_sq_var) / opmath_t(bias_correction2_sqrt) + opmath_t(eps); + } else { + demon_val = std::sqrt(exp_avg_sq_var) / opmath_t(bias_correction2_sqrt) + opmath_t(eps); + } + param_ptr[d] = param_val - opmath_t(step_size) * exp_avg_var / demon_val; + } +} + + +template +typename std::enable_if< + std::is_same::value || std::is_same::value, + void>:: + type inline adam_math( + scalar_t* param_ptr, + scalar_t* exp_avg_ptr, + scalar_t* exp_avg_sq_ptr, + scalar_t* grad_ptr, + scalar_t* max_exp_avg_sq_ptr, + double lr, + double bias_correction1, + double bias_correction2, + double exp_avg_grad_coefficient, + double exp_avg_sq_grad_coefficient, + double bias_correction2_sqrt, + double eps, + double weight_decay, + double beta2, + bool amsgrad, + bool maximize, + const float* grad_scale_ptr, + int64_t size +){ + double step_size = lr / bias_correction1; + using Vec = at::vec::Vectorized; + Vec grad_vec_to_store; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec param_vec = Vec::loadu(param_ptr + d); + Vec grad_vec = Vec::loadu(grad_ptr + d); + if (grad_scale_ptr) { + grad_vec = grad_vec / Vec(scalar_t(*grad_scale_ptr)); + grad_vec_to_store = grad_vec; + grad_vec_to_store.store(grad_ptr + d); + } + if (maximize) grad_vec = grad_vec * Vec(scalar_t(-1.0)); + if (weight_decay != 0.f){ + if constexpr (adam_mode == ADAM_MODE::ORIGINAL) { + grad_vec += param_vec * Vec(scalar_t(weight_decay)); + } else if constexpr (adam_mode == ADAM_MODE::ADAMW) { + param_vec = param_vec * Vec(scalar_t(1 - lr * weight_decay)); + } + } + Vec exp_avg_vec = Vec::loadu(exp_avg_ptr + d); + // exp_avg.lerp_(grad, 1 - beta1) + const Vec lerp_weight = Vec(scalar_t(exp_avg_grad_coefficient)); + auto mask = lerp_weight.abs() < Vec(0.5); + auto coeff = Vec::blendv(lerp_weight - Vec(1), lerp_weight, mask); + auto base = Vec::blendv(grad_vec, exp_avg_vec, mask); + exp_avg_vec = vec::fmadd(coeff, grad_vec - exp_avg_vec, base); + + Vec exp_avg_sq_vec = Vec::loadu(exp_avg_sq_ptr + d) * Vec(scalar_t(beta2)) + + Vec(scalar_t(exp_avg_sq_grad_coefficient)) * grad_vec * grad_vec; + exp_avg_vec.store(exp_avg_ptr + d); + exp_avg_sq_vec.store(exp_avg_sq_ptr + d); + + Vec denom_vec; + if (amsgrad) { + Vec max_exp_avg_sq_vec = + maximum(Vec::loadu(max_exp_avg_sq_ptr + d), exp_avg_sq_vec); + max_exp_avg_sq_vec.store(max_exp_avg_sq_ptr + d); + denom_vec = + (max_exp_avg_sq_vec.sqrt() / Vec(scalar_t(bias_correction2_sqrt))) + Vec(scalar_t(eps)); + } else { + denom_vec = + (exp_avg_sq_vec.sqrt() / Vec(scalar_t(bias_correction2_sqrt))) + Vec(scalar_t(eps)); + } + param_vec = param_vec + Vec(scalar_t(-step_size)) * exp_avg_vec / denom_vec; + param_vec.store(param_ptr + d); + } + scalar_t grad_val_to_store; + for (; d < size; d++) { + scalar_t grad_val = grad_ptr[d]; + if (grad_scale_ptr) { + grad_val = grad_ptr[d] / scalar_t(*grad_scale_ptr); + grad_val_to_store = grad_val; + grad_ptr[d] = grad_val_to_store; + } + if (maximize) grad_val = -grad_val; + if (weight_decay != 0.f){ + if constexpr (adam_mode == ADAM_MODE::ORIGINAL) { + grad_val += param_ptr[d] * scalar_t(weight_decay); + } else if constexpr (adam_mode == ADAM_MODE::ADAMW) { + param_ptr[d] = param_ptr[d] * scalar_t(1 - lr * weight_decay); + } + } + // exp_avg.lerp_(grad, 1 - beta1) + auto is_lerp_weight_small = std::abs(scalar_t(exp_avg_grad_coefficient)) < scalar_t(0.5); + if (is_lerp_weight_small) { + exp_avg_ptr[d] = exp_avg_ptr[d] + scalar_t(exp_avg_grad_coefficient) * (grad_val - exp_avg_ptr[d]); + } else { + exp_avg_ptr[d] 
= grad_val - (grad_val - exp_avg_ptr[d]) * (scalar_t(1) - scalar_t(exp_avg_grad_coefficient)); + } + exp_avg_sq_ptr[d] = exp_avg_sq_ptr[d] * scalar_t(beta2); + exp_avg_sq_ptr[d] = exp_avg_sq_ptr[d] + + scalar_t(exp_avg_sq_grad_coefficient) * grad_val * grad_val; + scalar_t demon_val; + if (amsgrad) { + max_exp_avg_sq_ptr[d] = + std::max(max_exp_avg_sq_ptr[d], exp_avg_sq_ptr[d]); + demon_val = + std::sqrt(max_exp_avg_sq_ptr[d]) / scalar_t(bias_correction2_sqrt) + scalar_t(eps); + } else { + demon_val = std::sqrt(exp_avg_sq_ptr[d]) / scalar_t(bias_correction2_sqrt) + scalar_t(eps); + } + param_ptr[d] = param_ptr[d] - scalar_t(step_size) * exp_avg_ptr[d] / demon_val; + } +} + + +template +void adam_fused_step_impl( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& exp_avg, + const at::Tensor& exp_avg_sq, + const at::Tensor& max_exp_avg_sq, + const at::Tensor& state_step, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const float* grad_scale_ptr) { + using opmath_t = at::opmath_type; + double step = state_step.item(); + scalar_t* param_data = param.data_ptr(); + scalar_t* exp_avg_data = exp_avg.data_ptr(); + scalar_t* exp_avg_sq_data = exp_avg_sq.data_ptr(); + scalar_t* max_exp_avg_sq_data = amsgrad ? max_exp_avg_sq.data_ptr() : nullptr; + scalar_t* grad_data = grad.data_ptr(); + + // need to use double here to align with non-fused adam + double bias_correction1 = 1 - std::pow(beta1, step); + double bias_correction2 = 1 - std::pow(beta2, step); + double exp_avg_grad_coefficient = 1 - beta1; + double exp_avg_sq_grad_coefficient = 1 - beta2; + double bias_correction2_sqrt = std::sqrt(bias_correction2); + + + constexpr size_t cache_line_size = 64; + constexpr int64_t cache_line_aligned_task_unit = cache_line_size / sizeof(scalar_t); + size_t num_units = divup(param.numel(), cache_line_aligned_task_unit); + + auto adam_fn = [&](int64_t begin, int64_t end) { + // local pointers + begin *= cache_line_aligned_task_unit; + end = std::min(end * cache_line_aligned_task_unit, param.numel()); + scalar_t* param_ptr = param_data + begin; + scalar_t* exp_avg_ptr = exp_avg_data + begin; + scalar_t* exp_avg_sq_ptr = exp_avg_sq_data + begin; + scalar_t* grad_ptr = grad_data + begin; + scalar_t* max_exp_avg_sq_ptr = amsgrad ? 
max_exp_avg_sq_data + begin : nullptr; + + const int64_t size = end - begin; + adam_math( + param_ptr, + exp_avg_ptr, + exp_avg_sq_ptr, + grad_ptr, + max_exp_avg_sq_ptr, + lr, + bias_correction1, + bias_correction2, + exp_avg_grad_coefficient, + exp_avg_sq_grad_coefficient, + bias_correction2_sqrt, + eps, + weight_decay, + beta2, + amsgrad, + maximize, + grad_scale_ptr, + size + ); + }; + at::parallel_for( + 0, num_units, 0, adam_fn); +} + +void fused_adam_kernel( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& exp_avg, + const at::Tensor& exp_avg_sq, + const at::Tensor& max_exp_avg_sq, + const at::Tensor& state_step, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const float* grad_scale_ptr, + const ADAM_MODE adam_mode + ) { + Tensor grad_contiguous = grad.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, param.scalar_type(), "fused_adam_kernel", [&] { + if(adam_mode == ADAM_MODE::ORIGINAL){ + adam_fused_step_impl(param, grad, exp_avg, exp_avg_sq, max_exp_avg_sq, state_step, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale_ptr); + } else { + adam_fused_step_impl(param, grad, exp_avg, exp_avg_sq, max_exp_avg_sq, state_step, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale_ptr); + } + + }); +} + +} + +REGISTER_DISPATCH(fused_adam_stub, &fused_adam_kernel); +} // namespace at::native diff --git a/aten/src/ATen/native/cpu/FusedSGDKernel.cpp b/aten/src/ATen/native/cpu/FusedSGDKernel.cpp new file mode 100644 index 0000000000000..c19aa249a1594 --- /dev/null +++ b/aten/src/ATen/native/cpu/FusedSGDKernel.cpp @@ -0,0 +1,278 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +namespace at::native { + +namespace{ + +template +typename std::enable_if< + std::is_same::value || std::is_same::value, + void>:: + type inline sgd_math( + scalar_t* param_ptr, + scalar_t* grad_ptr, + scalar_t* momentum_buf_ptr, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr, + int64_t size +){ + using lpVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + lpVec grad_vec_to_store; + fVec param_vec1, param_vec2; + fVec grad_vec1, grad_vec2; + fVec momentum_buffer_vec1, momentum_buffer_vec2; + int64_t d = 0; + for (; d < size - (size % lpVec::size()); d += lpVec::size()) { + lpVec param_lpvec = lpVec::loadu(param_ptr + d); + std::tie(param_vec1, param_vec2) = vec::convert_to_float(param_lpvec); + lpVec grad_lpvec = lpVec::loadu(grad_ptr + d); + std::tie(grad_vec1, grad_vec2) = vec::convert_to_float(grad_lpvec); + if (grad_scale_ptr) { + grad_vec1 = grad_vec1 / fVec(float(*grad_scale_ptr)); + grad_vec2 = grad_vec2 / fVec(float(*grad_scale_ptr)); + grad_vec_to_store = vec::convert_from_float(grad_vec1, grad_vec2); + grad_vec_to_store.store(grad_ptr + d); + } + if (maximize){ + grad_vec1 = grad_vec1 * fVec(opmath_t(-1.0)); + grad_vec2 = grad_vec2 * fVec(opmath_t(-1.0)); + } + if (weight_decay != 0.0){ + grad_vec1 = vec::fmadd(param_vec1, fVec(scalar_t(weight_decay)), grad_vec1); + grad_vec2 = vec::fmadd(param_vec2, fVec(scalar_t(weight_decay)), grad_vec2); + } + if (momentum != 0.0) { + fVec momentum_vec1, momentum_vec2; + if (is_first_step) { + momentum_vec1 = grad_vec1; + momentum_vec2 = grad_vec2; + } 
else { + + momentum_vec1 = fVec::loadu(momentum_buf_ptr + d) * fVec(scalar_t(momentum)); + momentum_vec2 = fVec::loadu(momentum_buf_ptr + d + fVec::size()) * fVec(scalar_t(momentum)); + momentum_vec1 = vec::fmadd(fVec(scalar_t(1 - dampening)), grad_vec1, momentum_vec1); + momentum_vec2 = vec::fmadd(fVec(scalar_t(1 - dampening)), grad_vec2, momentum_vec2); + } + vec::convert_from_float(momentum_vec1, momentum_vec2).store(momentum_buf_ptr + d);; + if (nesterov) { + grad_vec1 = vec::fmadd(momentum_vec1, fVec(scalar_t(momentum)), grad_vec1); + grad_vec2 = vec::fmadd(momentum_vec2, fVec(scalar_t(momentum)), grad_vec2); + } else { + grad_vec1 = momentum_vec1; + grad_vec2 = momentum_vec2; + } + } + } + scalar_t grad_val_to_store; + for (; d < size; d++) { + opmath_t grad_val = grad_ptr[d]; + opmath_t param_val = param_ptr[d]; + if (grad_scale_ptr) { + grad_val = grad_ptr[d] / opmath_t(*grad_scale_ptr); + grad_val_to_store = grad_val; + grad_ptr[d] = grad_val_to_store; + } + if (maximize) grad_val = -grad_val; + if (weight_decay != 0.0){ + grad_val += param_val * opmath_t(weight_decay); + } + if (momentum != 0.0) { + opmath_t momentum_buf_var = momentum_buf_ptr[d]; + if (is_first_step) { + momentum_buf_var = grad_val; + } else { + momentum_buf_var = momentum_buf_var * opmath_t(momentum) + + grad_val * opmath_t(1 - dampening); + } + momentum_buf_ptr[d] = momentum_buf_var; + if (nesterov) { + grad_val += momentum_buf_var * opmath_t(momentum); + } else { + grad_val = momentum_buf_var; + } + } + param_ptr[d] = param_val - grad_val * opmath_t(lr); + } +} + + +template +typename std::enable_if< + std::is_same::value || std::is_same::value, + void>:: + type inline sgd_math( + scalar_t* param_ptr, + scalar_t* grad_ptr, + scalar_t* momentum_buf_ptr, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr, + int64_t size +){ + using Vec = at::vec::Vectorized; + Vec grad_vec_to_store; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec param_vec = Vec::loadu(param_ptr + d); + Vec grad_vec = Vec::loadu(grad_ptr + d); + if (grad_scale_ptr) { + grad_vec = grad_vec / Vec(scalar_t(*grad_scale_ptr)); + grad_vec_to_store = grad_vec; + grad_vec_to_store.store(grad_ptr + d); + } + if (maximize) grad_vec = grad_vec * Vec(scalar_t(-1.0)); + if (weight_decay != 0.0){ + grad_vec = vec::fmadd(param_vec, Vec(scalar_t(weight_decay)), grad_vec); + } + if (momentum != 0.0) { + Vec momentum_vec; + if (is_first_step) { + momentum_vec = grad_vec; + } else { + momentum_vec = + Vec::loadu(momentum_buf_ptr + d) * Vec(scalar_t(momentum)); + momentum_vec = vec::fmadd(Vec(scalar_t(1 - dampening)), grad_vec, momentum_vec); + } + momentum_vec.store(momentum_buf_ptr + d); + if (nesterov) { + grad_vec = vec::fmadd(momentum_vec, Vec(scalar_t(momentum)), grad_vec); + } else { + grad_vec = momentum_vec; + } + } + param_vec += grad_vec * Vec(scalar_t(-lr)); + param_vec.store(param_ptr + d); + } + scalar_t grad_val_to_store; + for (; d < size; d++) { + scalar_t grad_val = grad_ptr[d]; + if (grad_scale_ptr) { + grad_val = grad_ptr[d] / scalar_t(*grad_scale_ptr); + grad_val_to_store = grad_val; + grad_ptr[d] = grad_val_to_store; + } + if (maximize) grad_val = -grad_val; + if (weight_decay != 0.0){ + grad_val += param_ptr[d] * scalar_t(weight_decay); + } + if (momentum != 0.0) { + if (is_first_step) { + momentum_buf_ptr[d] = grad_val; + } else { + momentum_buf_ptr[d] = 
momentum_buf_ptr[d] * scalar_t(momentum) + + grad_val * scalar_t(1 - dampening); + } + if (nesterov) { + grad_val += momentum_buf_ptr[d] * scalar_t(momentum); + } else { + grad_val = momentum_buf_ptr[d]; + } + } + param_ptr[d] -= grad_val * scalar_t(lr); + } +} + +template +void sgd_fused_step_impl( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& momentum_buffer, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr) { + using opmath_t = at::opmath_type; + scalar_t* param_data = param.data_ptr(); + scalar_t* grad_data = grad.data_ptr(); + bool has_momentum_buffer = momentum != 0.0; + scalar_t* momentum_buffer_data = has_momentum_buffer ? momentum_buffer.data_ptr() : nullptr; + + constexpr size_t cache_line_size = 64; + constexpr int64_t cache_line_aligned_task_unit = cache_line_size / sizeof(scalar_t); + size_t num_units = divup(param.numel(), cache_line_aligned_task_unit); + + auto sgd_fn = [&](int64_t begin, int64_t end) { + // local pointers + begin *= cache_line_aligned_task_unit; + end = std::min(end * cache_line_aligned_task_unit, param.numel()); + scalar_t* param_ptr = param_data + begin; + scalar_t* grad_ptr = grad_data + begin; + scalar_t* momentum_buffer_ptr = has_momentum_buffer ? momentum_buffer_data + begin : nullptr; + + const int64_t size = end - begin; + sgd_math( + param_ptr, + grad_ptr, + momentum_buffer_ptr, + weight_decay, + momentum, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr, + size + ); + }; + at::parallel_for( + 0, num_units, 0, sgd_fn); +} + +void fused_sgd_kernel( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& momentum_buffer, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr + ) { + Tensor grad_contiguous = grad.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, param.scalar_type(), "fused_sgd_kernel", [&] { + sgd_fused_step_impl( + param, + grad, + momentum_buffer, + weight_decay, + momentum, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr); + }); +} + +} + +REGISTER_DISPATCH(fused_sgd_stub, &fused_sgd_kernel); +} // namespace at::native diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 5c02472be592a..0a704e5419c4f 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -7,12 +7,10 @@ #include #include #include -#include #include #include #include -#include namespace at::native { namespace { @@ -110,7 +108,7 @@ namespace at::native { namespace { * // 3. writes the first `len` values in the interpolated vector to * // `out_slice` with spatial offset being `offset`. * // - * // This assimes that `grid_x` and `grid_y` all contain valid grid + * // This assumes that `grid_x` and `grid_y` all contain valid grid * // values \in [-1, 1], even at indices greater than `len`. 
* // * // The `*_slice` argument names mean samples within a batch (i.e., @@ -391,8 +389,7 @@ struct ComputeLocation } inline std::pair apply_get_grad(const Vec &in) const { - Vec res, grad_clip; - std::tie(res, grad_clip) = clip_coordinates_get_grad(unnormalize(in)); + auto [res, grad_clip] = clip_coordinates_get_grad(unnormalize(in)); return std::make_pair(res, grad_clip & Vec(scaling_factor)); } }; @@ -423,8 +420,8 @@ struct ComputeLocation } inline std::pair apply_get_grad(const Vec &in) const { - Vec res, grad_refl, grad_clip, grad(scaling_factor); - std::tie(res, grad_refl) = reflect_coordinates_get_grad(unnormalize(in)); + auto [res, grad_refl] = reflect_coordinates_get_grad(unnormalize(in)); + Vec grad_clip, grad(scaling_factor); grad = grad_refl * grad; std::tie(res, grad_clip) = clip_coordinates_get_grad(res); grad = grad_clip & grad; @@ -475,7 +472,7 @@ struct ApplyGridSample compute_W; const bool must_in_bound = padding != GridSamplerPadding::Zeros; - ApplyGridSample(const TensorAccessor& input) + ApplyGridSample(const TensorAccessor& input) : inp_H(input.size(2)) , inp_W(input.size(3)) , inp_sH(input.stride(2)) @@ -541,7 +538,7 @@ struct ApplyGridSample& out_slice, - const TensorAccessor& inp_slice, + const TensorAccessor& inp_slice, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { auto x = compute_W.apply(grid_x); @@ -591,20 +588,16 @@ struct ApplyGridSample inline void backward(TensorAccessor* gInp_slice_ptr, TensorAccessor& gGrid_slice, - const TensorAccessor& gOut_slice, - const TensorAccessor& inp_slice, + const TensorAccessor& gOut_slice, + const TensorAccessor& inp_slice, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { - Vec x, y, gx_mult, gy_mult; - std::tie(x, gx_mult) = compute_W.apply_get_grad(grid_x); - std::tie(y, gy_mult) = compute_H.apply_get_grad(grid_y); + auto [x, gx_mult] = compute_W.apply_get_grad(grid_x); + auto [y, gy_mult] = compute_H.apply_get_grad(grid_y); - Vec n, s, w, e, nw, ne, sw, se, nw_mask, ne_mask, sw_mask, se_mask; - iVec i_y_n, i_x_w; - - std::tie( + auto [ n, s, w, e, nw, ne, sw, se, nw_mask, ne_mask, sw_mask, se_mask, - i_y_n, i_x_w) = compute_interp_params(x, y); + i_y_n, i_x_w] = compute_interp_params(x, y); auto i_nw_offset = i_y_n * iVec(inp_sH) + i_x_w * iVec(inp_sW); auto i_ne_offset = i_nw_offset + iVec(inp_sW); @@ -722,7 +715,7 @@ struct ApplyGridSample compute_W; const bool must_in_bound = padding != GridSamplerPadding::Zeros; - ApplyGridSample(const TensorAccessor& input) + ApplyGridSample(const TensorAccessor& input) : inp_H(input.size(2)) , inp_W(input.size(3)) , inp_sH(input.stride(2)) @@ -733,7 +726,7 @@ struct ApplyGridSample& out_slice, - const TensorAccessor& inp_slice, + const TensorAccessor& inp_slice, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { auto x = compute_W.apply(grid_x); @@ -769,8 +762,8 @@ struct ApplyGridSample inline void backward(TensorAccessor* gInp_slice_ptr, TensorAccessor& gGrid_slice, - const TensorAccessor& gOut_slice, - const TensorAccessor& /*inp_slice*/, + const TensorAccessor& gOut_slice, + const TensorAccessor& /*inp_slice*/, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { if (input_requires_grad) { @@ -834,7 +827,7 @@ struct ApplyGridSample& input) + ApplyGridSample(const TensorAccessor& input) : inp_H(input.size(2)) , inp_W(input.size(3)) , inp_sH(input.stride(2)) @@ -913,7 +906,7 @@ struct ApplyGridSample& out_slice, - const TensorAccessor& inp_slice, + const TensorAccessor& inp_slice, 
int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { @@ -957,8 +950,8 @@ struct ApplyGridSample inline void backward(TensorAccessor* gInp_slice_ptr, TensorAccessor& gGrid_slice, - const TensorAccessor& gOut_slice, - const TensorAccessor& inp_slice, + const TensorAccessor& gOut_slice, + const TensorAccessor& inp_slice, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { Vec x = compute_W.unnormalize(grid_x); @@ -1028,7 +1021,7 @@ struct ApplyGridSample static inline void grid_sample_2d_grid_slice_iterator( - const TensorAccessor& grid_slice, const ApplyFn &apply_fn) { + const TensorAccessor& grid_slice, const ApplyFn &apply_fn) { int64_t out_H = grid_slice.size(0); int64_t out_W = grid_slice.size(1); int64_t grid_sH = grid_slice.stride(0); @@ -1193,8 +1186,8 @@ void grid_sampler_2d_cpu_kernel_impl( AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_2d_cpu_kernel_impl", [&] { auto out_acc = output.accessor(); - auto inp_acc = input.accessor(); - auto grid_acc = grid.accessor(); + auto inp_acc = input.accessor(); + auto grid_acc = grid.accessor(); if (align_corners) { switch (static_cast(interpolation_mode)) { HANDLE_INTERP(GridSamplerInterpolation::Bilinear, true); @@ -1281,9 +1274,9 @@ void grid_sampler_2d_backward_cpu_kernel_impl( AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_2d_backward_cpu_kernel_impl", [&] { auto gGrid_acc = grad_grid.accessor(); - auto inp_acc = input.accessor(); - auto grid_acc = grid.accessor(); - auto gOut_acc = grad_output.accessor(); + auto inp_acc = input.accessor(); + auto grid_acc = grid.accessor(); + auto gOut_acc = grad_output.accessor(); if (input_requires_grad) { auto gInp_acc = grad_input.accessor(); if (align_corners) { diff --git a/aten/src/ATen/native/cpu/HistogramKernel.cpp b/aten/src/ATen/native/cpu/HistogramKernel.cpp index e3a2b6c30bb80..196bfd5647a76 100644 --- a/aten/src/ATen/native/cpu/HistogramKernel.cpp +++ b/aten/src/ATen/native/cpu/HistogramKernel.cpp @@ -98,14 +98,14 @@ void histogramdd_cpu_contiguous(Tensor& hist, const TensorList& bin_edges, return; } - TensorAccessor accessor_in = input.accessor(); + TensorAccessor accessor_in = input.accessor(); /* Constructs a c10::optional containing an accessor iff * the optional weight tensor has a value. */ const auto accessor_wt = weight.has_value() - ? c10::optional>(weight.value().accessor()) - : c10::optional>(); + ? 
c10::optional>(weight.value().accessor()) + : c10::optional>(); std::vector bin_seq(D); std::vector num_bin_edges(D); @@ -292,10 +292,10 @@ void infer_bin_edges_from_input(const Tensor& input, const int64_t N, TORCH_INTERNAL_ASSERT(min.is_contiguous() && max.is_contiguous()); - const scalar_t *min_data = min.data_ptr(); + const scalar_t *min_data = min.const_data_ptr(); std::copy(min_data, min_data + N, leftmost_edges.begin()); - const scalar_t *max_data = max.data_ptr(); + const scalar_t *max_data = max.const_data_ptr(); std::copy(max_data, max_data + N, rightmost_edges.begin()); } diff --git a/aten/src/ATen/native/cpu/IndexKernel.cpp b/aten/src/ATen/native/cpu/IndexKernel.cpp index 36ce92f04d80b..1640d2d400edd 100644 --- a/aten/src/ATen/native/cpu/IndexKernel.cpp +++ b/aten/src/ATen/native/cpu/IndexKernel.cpp @@ -54,11 +54,12 @@ template void cpu_take_put_kernel( TensorIterator& iter, const TensorBase& indexed, + bool is_indexed_data_mutated, const func_t& f, bool serial_execution=false) { // This kernel follows the same strategy as `cpu_index_kernel` // Even though the indexed_tensor is const, we modify it through the data_ptr - // This is a bit dirty, but otherwise it would be necessary to innecessarily add tensor + // This is a bit dirty, but otherwise it would be necessary to unnecessarily add tensor // with zero strides to `iter` which would not be much better // When launch the parallel version, set a relative small grain size less than the INTERNAL::GRAIN_SIZE @@ -70,7 +71,9 @@ void cpu_take_put_kernel( const auto numel = indexed.numel(); const auto offset_indexed = IndexToOffset(indexed); - auto* indexed_data = indexed.data_ptr(); + auto* indexed_data = is_indexed_data_mutated ? + indexed.data_ptr() + : const_cast(indexed.const_data_ptr()); auto loop = [&](char** data, const int64_t* strides, int64_t n) { auto* iterated_data_bytes = data[0]; auto* index_data_bytes = data[1]; @@ -115,21 +118,21 @@ void put_kernel( bool use_parallel_for = (!is_deterministic) && ( (iter.numel() >= internal::GRAIN_SIZE) && (at::get_num_threads() > 1)); if (use_parallel_for && iter.dtype() == ScalarType::Float) { - cpu_take_put_kernel(iter, self, + cpu_take_put_kernel(iter, self, true, [](float& iterated, float* indexed, const int64_t idx) { cpu_atomic_add_float(indexed+idx, iterated); }); } else { // TODO: investigate parallelization of the accumulate kernel. // Unlike the non-accumulate case, this needs to be thread-safe. 
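      // Note: accumulation is what makes parallelism tricky here. If two
      // entries of `iter` carry the same destination index, running
      // `indexed[idx] += iterated` from multiple threads is a data race.
      // The float branch above stays parallel only because it routes the
      // update through cpu_atomic_add_float; every other dtype takes the
      // serial path below (serial_execution=true).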
- cpu_take_put_kernel(iter, self, + cpu_take_put_kernel(iter, self, true, [](scalar_t& iterated, scalar_t* indexed, const int64_t idx) { indexed[idx] += iterated; }, /*serial_execution=*/true); } } else { - cpu_take_put_kernel(iter, self, + cpu_take_put_kernel(iter, self, true, [](scalar_t& iterated, scalar_t* indexed, const int64_t idx) { indexed[idx] = iterated; }); @@ -142,8 +145,8 @@ void take_kernel( const TensorBase & input) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, iter.dtype(), "take_cpu", [&] { - cpu_take_put_kernel(iter, input, - [](scalar_t& iterated, scalar_t* indexed, const int64_t idx) { + cpu_take_put_kernel(iter, input, false, + [](scalar_t& iterated, const scalar_t* indexed, const int64_t idx) { iterated = indexed[idx]; }); }); @@ -332,7 +335,7 @@ void masked_fill_kernel(TensorIterator& iter, const Scalar& value) { template void cpu_masked_scatter_kernel(TensorIterator& iter, const TensorBase& source) { std::ptrdiff_t source_cntr = 0; - scalar_t* source_ptr = source.data_ptr(); + const scalar_t* source_ptr = source.const_data_ptr(); auto numel = source.numel(); auto loop = [&](char** data, const int64_t* strides, int64_t n) { @@ -744,11 +747,11 @@ void flip_kernel(TensorIterator& iter, const bool quantized) { // Special cases: // a) channels last hflip on (N, C, H, W) and outer_stride(=dtype_size * C) in [2, 16] // b) flip dim=-2 on (N, ..., M, C) and outer_stride(=dtype_size * C) in [2, 16] - auto output_strides = iter.strides(0); - auto input_strides = iter.strides(1); - auto c = -output_strides[1]; + auto output_strides_2 = iter.strides(0); + auto input_strides_2 = iter.strides(1); + auto c = -output_strides_2[1]; if (c >= 2 && c <= 16 && - c == input_strides[1] && + c == input_strides_2[1] && c == iter.element_size(0) * iter.shape()[0] // checks if dim=1 is contiguous as well ) { return cpu_hflip_channels_last_vec(iter); diff --git a/aten/src/ATen/native/cpu/LerpKernel.cpp b/aten/src/ATen/native/cpu/LerpKernel.cpp index c9bf7525a76fb..7eaac38c21c8a 100644 --- a/aten/src/ATen/native/cpu/LerpKernel.cpp +++ b/aten/src/ATen/native/cpu/LerpKernel.cpp @@ -72,9 +72,8 @@ void lerp_scalar_kernel(at::TensorIteratorBase& iter, const Scalar& weight) { return lerp(self_val, end_val, weight_val); }, [=](bVec self_vec, bVec end_vec) -> bVec { - fVec self_vec0, self_vec1, end_vec0, end_vec1; - std::tie(self_vec0, self_vec1) = convert_bfloat16_float(self_vec); - std::tie(end_vec0, end_vec1) = convert_bfloat16_float(end_vec); + auto [self_vec0, self_vec1] = convert_bfloat16_float(self_vec); + auto [end_vec0, end_vec1] = convert_bfloat16_float(end_vec); auto result0 = lerp_vec(self_vec0, end_vec0, weight_vec); auto result1 = lerp_vec(self_vec1, end_vec1, weight_vec); return convert_float_bfloat16(result0, result1); @@ -90,9 +89,8 @@ void lerp_scalar_kernel(at::TensorIteratorBase& iter, const Scalar& weight) { return lerp(self_val, end_val, weight_val); }, [=](hVec self_vec, hVec end_vec) -> hVec { - fVec self_vec0, self_vec1, end_vec0, end_vec1; - std::tie(self_vec0, self_vec1) = convert_half_float(self_vec); - std::tie(end_vec0, end_vec1) = convert_half_float(end_vec); + auto [self_vec0, self_vec1] = convert_half_float(self_vec); + auto [end_vec0, end_vec1] = convert_half_float(end_vec); auto result0 = lerp_vec(self_vec0, end_vec0, weight_vec); auto result1 = lerp_vec(self_vec1, end_vec1, weight_vec); return convert_float_half(result0, result1); @@ -116,34 +114,30 @@ void lerp_scalar_kernel(at::TensorIteratorBase& iter, const 
Scalar& weight) { void lerp_tensor_kernel(at::TensorIteratorBase& iter) { if (iter.common_dtype() == kBFloat16) { using bVec = Vectorized; - using fVec = Vectorized; at::native::cpu_kernel_vec( iter, [=](BFloat16 self_val, BFloat16 end_val, BFloat16 weight_val) -> BFloat16 { return lerp(self_val, end_val, weight_val); }, [=](bVec self_vec, bVec end_vec, bVec weight_vec) -> bVec { - fVec self_vec0, self_vec1, end_vec0, end_vec1, weight_vec0, weight_vec1; - std::tie(self_vec0, self_vec1) = convert_bfloat16_float(self_vec); - std::tie(end_vec0, end_vec1) = convert_bfloat16_float(end_vec); - std::tie(weight_vec0, weight_vec1) = convert_bfloat16_float(weight_vec); + auto [self_vec0, self_vec1] = convert_bfloat16_float(self_vec); + auto [end_vec0, end_vec1] = convert_bfloat16_float(end_vec); + auto [weight_vec0, weight_vec1] = convert_bfloat16_float(weight_vec); auto result0 = lerp_vec(self_vec0, end_vec0, weight_vec0); auto result1 = lerp_vec(self_vec1, end_vec1, weight_vec1); return convert_float_bfloat16(result0, result1); }); } else if (iter.common_dtype() == kHalf) { using hVec = Vectorized; - using fVec = Vectorized; at::native::cpu_kernel_vec( iter, [=](Half self_val, Half end_val, Half weight_val) -> Half { return lerp(self_val, end_val, weight_val); }, [=](hVec self_vec, hVec end_vec, hVec weight_vec) -> hVec { - fVec self_vec0, self_vec1, end_vec0, end_vec1, weight_vec0, weight_vec1; - std::tie(self_vec0, self_vec1) = convert_half_float(self_vec); - std::tie(end_vec0, end_vec1) = convert_half_float(end_vec); - std::tie(weight_vec0, weight_vec1) = convert_half_float(weight_vec); + auto [self_vec0, self_vec1] = convert_half_float(self_vec); + auto [end_vec0, end_vec1] = convert_half_float(end_vec); + auto [weight_vec0, weight_vec1] = convert_half_float(weight_vec); auto result0 = lerp_vec(self_vec0, end_vec0, weight_vec0); auto result1 = lerp_vec(self_vec1, end_vec1, weight_vec1); return convert_float_half(result0, result1); diff --git a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp index 06421ee57a0bb..17e9b752d6c53 100644 --- a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp @@ -65,7 +65,7 @@ template inline typename std::enable_if::value, void>::type compute_internal( - scalar_t* input_data, + const scalar_t* input_data, scalar_t* out_data, opmath_t* max_ptr, vec::int_same_size_t* index_ptr, @@ -99,7 +99,7 @@ compute_internal( for (int64_t id = id0; id < id1; id += dilationD) { for (int64_t ih = ih0; ih < ih1; ih += dilationH) { for (int64_t iw = iw0; iw < iw1; iw += dilationW) { - scalar_t* in = input_data + (n * input_depth * input_height * input_width + + const scalar_t* in = input_data + (n * input_depth * input_height * input_width + id * input_height * input_width + ih * input_width + iw) * channels; int64_t d2 = 0; @@ -138,7 +138,7 @@ template inline typename std::enable_if::value, void>::type compute_internal( - scalar_t* input_data, + const scalar_t* input_data, scalar_t* out_data, opmath_t* max_ptr, vec::int_same_size_t* index_ptr, @@ -172,15 +172,14 @@ compute_internal( for (int64_t id = id0; id < id1; id += dilationD) { for (int64_t ih = ih0; ih < ih1; ih += dilationH) { for (int64_t iw = iw0; iw < iw1; iw += dilationW) { - scalar_t* in = input_data + (n * input_depth * input_height * input_width + + const scalar_t* in = input_data + (n * input_depth * input_height * input_width + id * input_height * input_width + ih * input_width + iw) * channels; int64_t d2 = 0; for (; d2 < len; d2 += 
Vec::size()) { iVec index_ivec = iVec(id * input_height * input_width + ih * input_width + iw); Vec val_bvec = Vec::loadu(in + d2); - fVec val_fvec0, val_fvec1; - std::tie(val_fvec0, val_fvec1) = convert_to_float(val_bvec); + auto [val_fvec0, val_fvec1] = convert_to_float(val_bvec); iVec maxindex_ivec0 = iVec::loadu(index_ptr + d2); iVec maxindex_ivec1 = iVec::loadu(index_ptr + d2 + iVec::size()); @@ -260,7 +259,7 @@ void cpu_max_pool( auto output = output_.contiguous(); auto indices = indices_.contiguous(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -291,7 +290,7 @@ void cpu_max_pool( // parallel on dim N, C at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { for (int64_t c = begin; c < end; c++) { - scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width; + const scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width; scalar_t* output_ptr = output_data + c * output_depth * output_height * output_width; int64_t* indices_ptr = indices_data + c * output_depth * output_height * output_width; @@ -390,7 +389,7 @@ void cpu_max_pool_channels_last( auto output = output_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -406,7 +405,7 @@ void cpu_max_pool_channels_last( using opmath_t = at::opmath_type; using Vec = vec::Vectorized; using integer_t = vec::int_same_size_t; - // for the convience of vectorization, use integer of the same size of scalar_t, + // for the convenience of vectorization, use integer of the same size of scalar_t, // e.g. 
int32_t for float, int64_t for double // need to make sure doesn't overflow TORCH_CHECK(input_depth * input_height * input_width <= std::numeric_limits::max()); @@ -476,8 +475,8 @@ void cpu_max_pool_backward( auto indices = indices_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); - auto indices_data = indices.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); // treat batch size and channels as one dimension @@ -508,8 +507,8 @@ void cpu_max_pool_backward( at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + c * input_depth * input_height * input_width; - scalar_t* grad_output_ptr = grad_output_data + c * output_depth * output_height * output_width; - int64_t * indices_ptr = indices_data + c * output_depth * output_height * output_width; + const scalar_t* grad_output_ptr = grad_output_data + c * output_depth * output_height * output_width; + const int64_t * indices_ptr = indices_data + c * output_depth * output_height * output_width; for (int64_t od = 0; od < output_depth; od++) { for (int64_t oh = 0; oh < output_height; oh++) { @@ -550,8 +549,8 @@ void cpu_max_pool_backward_channels_last( auto indices = indices_.contiguous(memory_format); auto grad_input_data = grad_input.mutable_data_ptr(); - auto grad_output_data = grad_output.data_ptr(); - auto indices_data = indices.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); // MaxPool2d: NHWC // MaxPool3d: NDHWC @@ -568,14 +567,14 @@ void cpu_max_pool_backward_channels_last( at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { for (const auto n : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + n * input_depth * input_height * input_width * channels; - scalar_t* grad_output_ptr = grad_output_data + n * output_depth * output_height * output_width * channels; - int64_t* indices_ptr = indices_data + n * output_depth * output_height * output_width * channels; + const scalar_t* grad_output_ptr = grad_output_data + n * output_depth * output_height * output_width * channels; + const int64_t* indices_ptr = indices_data + n * output_depth * output_height * output_width * channels; for (int64_t od = 0; od < output_depth; od++) { for (int64_t oh = 0; oh < output_height; oh++) { for (int64_t ow = 0; ow < output_width; ow++) { - scalar_t* gout = grad_output_ptr + (od * output_height * output_width + oh * output_width + ow) * channels; - int64_t* ind = indices_ptr + (od * output_height * output_width + oh * output_width + ow) * channels; + const scalar_t* gout = grad_output_ptr + (od * output_height * output_width + oh * output_width + ow) * channels; + const int64_t* ind = indices_ptr + (od * output_height * output_width + oh * output_width + ow) * channels; // TODO: gcc vectorization for (int64_t c = 0; c < channels; c++) { int64_t maxindex = ind[c]; diff --git a/aten/src/ATen/native/cpu/MaxPooling.cpp b/aten/src/ATen/native/cpu/MaxPooling.cpp index 70443e67ae74d..660708a2a06d6 100644 --- a/aten/src/ATen/native/cpu/MaxPooling.cpp +++ b/aten/src/ATen/native/cpu/MaxPooling.cpp @@ -39,7 +39,7 @@ void max_pool1d_impl( [&] { const Tensor in = input.contiguous(); scalar_t* const OP = output.data_ptr(); - const scalar_t* const IP = in.data_ptr(); + const scalar_t* const IP = 
in.const_data_ptr(); // Value used for padding scalar_t FILL = std::numeric_limits::has_infinity diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp index c9dc3eded2a19..d5af5d23e8b10 100644 --- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp @@ -20,8 +20,8 @@ void cpu_max_unpool( const Tensor& indices) { auto output = output_.contiguous(); - auto input_data = input.data_ptr(); - auto indices_data = indices.data_ptr(); + auto input_data = input.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); auto output_data = output.data_ptr(); // NB: input tensor dimensions: @@ -105,8 +105,8 @@ void cpu_max_unpool_channels_last( auto memory_format = at::MemoryFormat::ChannelsLast; auto output = output_.contiguous(memory_format); - auto input_data = input.data_ptr(); - auto indices_data = indices.data_ptr(); + auto input_data = input.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); @@ -127,8 +127,8 @@ void cpu_max_unpool_channels_last( data_index_init(begin, n, nbatch, ip, input_image_size); for (const auto i : c10::irange(begin, end)) { - scalar_t* input_ptr = input_data + i * channels; - int64_t* indices_ptr = indices_data + i * channels; + const scalar_t* input_ptr = input_data + i * channels; + const int64_t* indices_ptr = indices_data + i * channels; scalar_t* output_ptr = output_data + n * output_image_size * channels; // can't do scatter on avx2 (only available on avx512) diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp index c5c2eebb5d35e..1c4054abdf239 100644 --- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp +++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp @@ -36,7 +36,7 @@ multinomial_with_replacement_apply( /* cumulative probability distribution vector */ Tensor cum_dist = at::empty({n_categories}, self.options()); - const scalar_t* const self_ptr = self.data_ptr(); + const scalar_t* const self_ptr = self.const_data_ptr(); scalar_t* const cum_dist_ptr = cum_dist.data_ptr(); int64_t* const result_ptr = result.data_ptr(); @@ -140,7 +140,7 @@ multinomial_with_replacement_apply( /* cumulative probability distribution vector */ Tensor cum_dist = at::empty({n_categories}, self.options().dtype(kFloat)); - const scalar_t* const self_ptr = self.data_ptr(); + const scalar_t* const self_ptr = self.const_data_ptr(); float* const cum_dist_ptr = cum_dist.data_ptr(); int64_t* const result_ptr = result.data_ptr(); diff --git a/aten/src/ATen/native/cpu/PaddingKernel.cpp b/aten/src/ATen/native/cpu/PaddingKernel.cpp index ca438f144b2de..302346c4515c9 100644 --- a/aten/src/ATen/native/cpu/PaddingKernel.cpp +++ b/aten/src/ATen/native/cpu/PaddingKernel.cpp @@ -17,7 +17,7 @@ struct PaddingParams { int64_t nbatch; int64_t channels; - // use vectorized logic on width when output index is in [pad, input_witdh + pad), + // use vectorized logic on width when output index is in [pad, input_width + pad), // applies only to Channels First format when pad_l and pad_r are both positive. bool is_padding_positive_width; @@ -136,7 +136,7 @@ void cpu_padding( auto input = input_.contiguous(); auto output = output_.contiguous(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); // fold nbatch and channels into single dimension for channels first. 
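Most of the pointer-type changes in this file and the neighboring CPU kernels are the same mechanical const-correctness migration: tensors that are only read are now accessed through const_data_ptr<T>() (or a const-qualified accessor), while outputs keep data_ptr<T>() / mutable_data_ptr<T>(). A minimal standalone sketch of that read-only/mutable split, using a hypothetical copy helper rather than any kernel from this patch:

#include <ATen/ATen.h>
#include <ATen/Parallel.h>

// Hypothetical helper (not part of the patch): element-wise copy of a
// contiguous float tensor, reading through the const pointer and writing
// through the mutable one, with the work split across threads.
void copy_float_kernel(const at::Tensor& src, at::Tensor& dst) {
  TORCH_CHECK(src.is_contiguous() && dst.is_contiguous(), "expected contiguous tensors");
  TORCH_CHECK(src.numel() == dst.numel(), "size mismatch");
  const float* src_data = src.const_data_ptr<float>(); // read-only view of the input
  float* dst_data = dst.mutable_data_ptr<float>();     // mutable view of the output
  at::parallel_for(0, src.numel(), at::internal::GRAIN_SIZE,
      [&](int64_t begin, int64_t end) {
        for (int64_t i = begin; i < end; ++i) {
          dst_data[i] = src_data[i];
        }
      });
}

Besides documenting intent, the const pointer keeps the compiler from silently accepting writes through what should be a read-only input.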
@@ -158,7 +158,7 @@ void cpu_padding( // do vectorized copy whe output is overlapped with input on W, // only applies to positive padding - auto loop = [=](scalar_t* out, scalar_t* in, bool positive_padding) { + auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) { if (positive_padding) { for (const auto ow : c10::irange(pad_w)) { int64_t iw = PaddingType::index(ow, input_width, pad_w, offset_w); @@ -198,7 +198,7 @@ void cpu_padding( for (const auto i : c10::irange(begin, end)) { int64_t ih = PaddingType::index(oh, input_height, pad_h, offset_h); scalar_t* output_ptr = output_data + i * output_width; - scalar_t* input_ptr = input_data + c * input_height * input_width + ih * input_width; + const scalar_t* input_ptr = input_data + c * input_height * input_width + ih * input_width; loop(output_ptr, input_ptr, p.is_padding_positive_width); data_index_step(c, channels, oh, output_height); @@ -214,7 +214,7 @@ void cpu_padding( int64_t id = PaddingType::index(od, input_depth, pad_d, offset_d); int64_t ih = PaddingType::index(oh, input_height, pad_h, offset_h); scalar_t* output_ptr = output_data + i * output_width; - scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width + + const scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width + id * input_height * input_width + ih * input_width; loop(output_ptr, input_ptr, p.is_padding_positive_width); @@ -243,7 +243,7 @@ void cpu_padding_channels_last( auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = p.nbatch; @@ -274,7 +274,7 @@ void cpu_padding_channels_last( int64_t iw = PaddingType::index(ow, input_width, pad_w, offset_w); scalar_t* output_ptr = output_data + i * channels; - scalar_t* input_ptr = input_data + (n * input_height * input_width + ih * input_width + iw) * channels; + const scalar_t* input_ptr = input_data + (n * input_height * input_width + ih * input_width + iw) * channels; copy_stub(output_ptr, input_ptr, channels); data_index_step(n, nbatch, oh, output_height, ow, output_width); @@ -292,7 +292,7 @@ void cpu_padding_channels_last( int64_t iw = PaddingType::index(ow, input_width, pad_w, offset_w); scalar_t* output_ptr = output_data + i * channels; - scalar_t* input_ptr = input_data + (n * input_depth * input_height * input_width + + const scalar_t* input_ptr = input_data + (n * input_depth * input_height * input_width + id * input_height * input_width + ih * input_width + iw) * channels; copy_stub(output_ptr, input_ptr, channels); @@ -317,7 +317,7 @@ void cpu_padding_backward( auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.data_ptr(); // fold nbatch and channels into single dimension for channels first. 
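The backward hunks that follow accumulate into grad_input rather than assigning, because under reflection several output positions can map to the same input element. A toy 1-D sketch of that accumulation, assuming the textbook reflect mapping and 0 <= pad < input_width; the real kernels take their index mapping from PaddingType::index and additionally handle 2-D/3-D shapes, channels-last layout, and the other padding modes:

#include <cstdint>
#include <vector>

// Toy 1-D reflection-padding backward (hypothetical helper, not the patch's code).
// grad_output has input_width + 2 * pad elements; assumes pad < input_width so
// a single reflection suffices.
std::vector<float> reflect_pad1d_backward(const std::vector<float>& grad_output,
                                          int64_t input_width, int64_t pad) {
  std::vector<float> grad_input(input_width, 0.f);
  const int64_t output_width = input_width + 2 * pad;
  for (int64_t ow = 0; ow < output_width; ++ow) {
    int64_t iw = ow - pad;                                   // shift into input coordinates
    if (iw < 0) iw = -iw;                                    // reflect across the left edge
    if (iw >= input_width) iw = 2 * (input_width - 1) - iw;  // reflect across the right edge
    grad_input[iw] += grad_output[ow];  // accumulate: different ow can hit the same iw
  }
  return grad_input;
}

For input_width = 4 and pad = 2, output positions {0, 4, 6} all land on input index 2, which is exactly why assignment instead of += would drop gradient.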
@@ -351,7 +351,7 @@ void cpu_padding_backward( // parallel on N,C, sequential on H,W at::parallel_for(0, channels, 1, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { - scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; + const scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width; for (const auto oh : c10::irange(output_height)) { @@ -367,7 +367,7 @@ void cpu_padding_backward( // parallel on N,C, sequential on D,H,W at::parallel_for(0, channels, 1, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { - scalar_t* grad_output_ptr = grad_output_data + c * output_depth *output_height * output_width; + const scalar_t* grad_output_ptr = grad_output_data + c * output_depth *output_height * output_width; scalar_t* grad_input_ptr = grad_input_data + c * input_depth * input_height * input_width; for (const auto od : c10::irange(output_depth)) { @@ -406,7 +406,7 @@ void cpu_padding_backward_channels_last( auto grad_output = grad_output_.contiguous(memory_format); auto grad_input_data = grad_input.data_ptr(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); int64_t nbatch = p.nbatch; int64_t channels = p.channels; @@ -435,7 +435,7 @@ void cpu_padding_backward_channels_last( int64_t iw = PaddingType::index(ow, input_width, pad_w, offset_w); scalar_t* grad_input_ptr = grad_input_data + (n * input_height * input_width + ih * input_width + iw) * channels; - scalar_t* grad_output_ptr = grad_output_data + + const scalar_t* grad_output_ptr = grad_output_data + (n * output_height * output_width + oh * output_width + ow) * channels; add_stub(grad_input_ptr, grad_output_ptr, channels); } @@ -455,7 +455,7 @@ void cpu_padding_backward_channels_last( scalar_t* grad_input_ptr = grad_input_data + (n * input_depth * input_height * input_width + id * input_height * input_width + ih * input_width + iw) * channels; - scalar_t* grad_output_ptr = grad_output_data + + const scalar_t* grad_output_ptr = grad_output_data + (n * output_depth * output_height * output_width + od * output_height * output_width + oh * output_width + ow) * channels; add_stub(grad_input_ptr, grad_output_ptr, channels); diff --git a/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp b/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp index b654518ae273a..d81e3c50fcea5 100644 --- a/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp +++ b/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp @@ -17,7 +17,7 @@ void cpu_pixel_shuffle( TensorBase& output, const TensorBase& input, int64_t upscale_factor) { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); // [(B1...Bn), C, H, W] => [N, C, H, W] @@ -59,7 +59,7 @@ void cpu_pixel_shuffle_channels_last( int64_t upscale_factor) { TORCH_CHECK(input.ndimension() == 4, "pixel shuffle with channels last format supports tensors with 4 dims"); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); @@ -81,7 +81,7 @@ void cpu_pixel_shuffle_channels_last( data_index_init(begin, n, nbatch, h, height); for (const auto i : c10::irange(begin, end)) { for (const auto w : c10::irange(width)) { - scalar_t* input_ptr = input_data + n * height * width * channels + h * width * channels + w * channels; + const scalar_t* input_ptr = 
input_data + n * height * width * channels + h * width * channels + w * channels; // step 1: transpose each channel lane // from: [c, s1*s2] @@ -115,7 +115,7 @@ void cpu_pixel_unshuffle( TensorBase& output, const TensorBase& input, int64_t downscale_factor) { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); // [(B1...Bn), C, H, W] => [N, C, H, W] @@ -158,7 +158,7 @@ void cpu_pixel_unshuffle_channels_last( int64_t downscale_factor) { TORCH_CHECK(input.ndimension() == 4, "pixel unshuffle with channels last format supports tensors with 4 dims"); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); diff --git a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp index 25243b2b19107..e02e57828e9b3 100644 --- a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp @@ -117,10 +117,9 @@ static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& no // 1 if x >= beta // -1 if x <= -beta // x / beta if |x| < beta - Vectorized input0, input1, target0, target1, grad_output0, grad_output1; - std::tie(input0, input1) = convert_bfloat16_float(input); - std::tie(target0, target1) = convert_bfloat16_float(target); - std::tie(grad_output0, grad_output1) = convert_bfloat16_float(grad_output); + auto [input0, input1] = convert_bfloat16_float(input); + auto [target0, target1] = convert_bfloat16_float(target); + auto [grad_output0, grad_output1] = convert_bfloat16_float(grad_output); auto x = input0 - target0; auto pos_or_neg_1_vec = Vectorized::blendv( neg_1_vec, pos_1_vec, x > zero_vec); diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h index fdb1c0d1a0fce..26155373be589 100644 --- a/aten/src/ATen/native/cpu/Reduce.h +++ b/aten/src/ATen/native/cpu/Reduce.h @@ -7,6 +7,7 @@ #include #include +#include namespace at { namespace native { inline namespace CPU_CAPABILITY { @@ -154,7 +155,7 @@ static void set_results(const std::tuple& result, const TensorIterator } template -struct all_same : guts::conjunction< +struct all_same : std::conjunction< std::is_same... 
> {}; diff --git a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp index 125f3ce3d11fd..04fc88d1d147e 100644 --- a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp @@ -29,7 +29,7 @@ inline void reduce_all_impl_vec( vec_func_t vop) { using Vec = Vectorized>; const int64_t input_numel = input.numel(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); // NOTE: parallel_reduce not support bool type scalar_t result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v, [&](int64_t start, int64_t end, const scalar_t /*ident*/) -> scalar_t { @@ -50,7 +50,7 @@ inline void reduce_all_impl( const scalar_t ident_v, func_t op) { const int64_t input_numel = input.numel(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); scalar_t result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v, [&](int64_t start, int64_t end, const scalar_t ident) -> scalar_t { scalar_t partial_out = ident; @@ -123,7 +123,7 @@ inline void reduce_all_impl_two_outputs( func_t2 reduce_acc_func) { using scalar_t_pair = std::pair; const int64_t input_numel = input.numel(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); scalar_t_pair result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v, [&](int64_t start, int64_t end, const scalar_t_pair& ident) -> scalar_t_pair { scalar_t_pair partial_out(ident); @@ -150,7 +150,7 @@ inline void reduce_all_impl_vec_two_outputs( using Vec = Vectorized>; using scalar_t_pair = std::pair; const int64_t input_numel = input.numel(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); // NOTE: parallel_reduce not support bool type std::pair result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v, [&](int64_t start, int64_t end, const scalar_t_pair& /* ident */) -> scalar_t_pair { diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 92250c115022c..c935f81f9ff08 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -53,7 +53,7 @@ static inline void cpu_cum_base_kernel(const Tensor& result, // NOLINTNEXTLINE(bugprone-argument-comment) .declare_static_shape(self.sizes(), /*squash_dim=*/dim) .add_output(result) - .add_input(self) + .add_const_input(self) .build(); auto result_dim_stride = ensure_nonempty_stride(result, dim); @@ -183,8 +183,7 @@ inline void norm_two_reduce_step(Vectorized& acc_vec, Vectorized inline void norm_two_reduce_step(Vectorized& acc_fvec, Vectorized& data_bvec) { - Vectorized data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_bfloat16_float(data_bvec); acc_fvec += data_fvec0 * data_fvec0; acc_fvec += data_fvec1 * data_fvec1; } @@ -196,7 +195,7 @@ template void norm_kernel_cpu_impl(TensorIterator& iter, const double& val) { if (val == 0.0) { binary_kernel_reduce(iter, NormZeroOps(), acc_t(0)); - } else if (val == 0.0) { + } else if (val == 1.0) { binary_kernel_reduce(iter, NormOneOps(), acc_t(0)); } else if (val == 2.0) { binary_kernel_reduce(iter, NormTwoOps(), acc_t(0)); @@ -291,7 +290,9 @@ static void and_kernel_impl(TensorIterator& iter) { iter, [=](uint8_t a, uint8_t b) -> uint8_t { return (a && b) ? 
1 : 0; }, [=](Vectorized a, Vectorized b) { - return a & b; + // NB: != returns 0xFF rather than 0x01, so we must negate to get + // the desired result + return (a != Vectorized(0)).neg() & (b != Vectorized(0)).neg(); }, /*ident=*/true); } else { @@ -327,7 +328,7 @@ static void or_kernel_impl(TensorIterator& iter) { iter, [=](uint8_t a, uint8_t b) -> uint8_t { return (a || b) ? 1 : 0; }, [=](Vectorized a, Vectorized b) { - return a | b; + return (a != Vectorized(0)).neg() | (b != Vectorized(0)).neg(); }, /*ident=*/false); } else { diff --git a/aten/src/ATen/native/cpu/ReduceUtils.h b/aten/src/ATen/native/cpu/ReduceUtils.h index c54dc494fb6fa..d6afac295aff6 100644 --- a/aten/src/ATen/native/cpu/ReduceUtils.h +++ b/aten/src/ATen/native/cpu/ReduceUtils.h @@ -158,8 +158,7 @@ inline void map_acc( constexpr int64_t kaVecSize = aVec::size(); for (d = 0; d < size - (size % kVecSize); d += kVecSize) { Vec data2_vec = Vec::loadu(input_data2 + d); - aVec data2_avec0, data2_avec1; - std::tie(data2_avec0, data2_avec1) = convert_to_float(data2_vec); + auto [data2_avec0, data2_avec1] = convert_to_float(data2_vec); aVec input_vec0 = aVec::loadu(input_data + d); aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize); vec_fun(input_vec0, data2_avec0).store(output_data + d); @@ -168,8 +167,7 @@ inline void map_acc( if (size - d > 0) { int64_t tail_size = size - d; Vec data2_vec = Vec::loadu(input_data2 + d, tail_size); - aVec data2_avec0, data2_avec1; - std::tie(data2_avec0, data2_avec1) = convert_to_float(data2_vec); + auto [data2_avec0, data2_avec1] = convert_to_float(data2_vec); if (tail_size > kaVecSize) { aVec input_vec0 = aVec::loadu(input_data + d); aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize, tail_size - kaVecSize); @@ -199,7 +197,7 @@ inline T update(const T& x, const T& y) { } template -inline void update(scalar_t* out, scalar_t* data, int64_t K) { +inline void update(scalar_t* out, const scalar_t* data, int64_t K) { using Vec = vec::Vectorized>; map2( [](Vec x, Vec y) { return update(x, y); }, @@ -211,7 +209,7 @@ inline void update(scalar_t* out, scalar_t* data, int64_t K) { template , int> = 0> -inline void update(at::opmath_type* out, scalar_t* data, int64_t K) { +inline void update(at::opmath_type* out, const scalar_t* data, int64_t K) { using opmath_t = at::opmath_type; using Vec = vec::Vectorized; map_acc( diff --git a/aten/src/ATen/native/cpu/SampledAddmmKernel.cpp b/aten/src/ATen/native/cpu/SampledAddmmKernel.cpp index 731f91c349e7a..ed752f7b39364 100644 --- a/aten/src/ATen/native/cpu/SampledAddmmKernel.cpp +++ b/aten/src/ATen/native/cpu/SampledAddmmKernel.cpp @@ -26,8 +26,8 @@ void sampled_addmm_sparse_csr_kernel_impl( auto beta_ = beta.to(); auto alpha_ = alpha.to(); - scalar_t* mat1_data = mat1.data_ptr(); - scalar_t* mat2_data = mat2.data_ptr(); + const scalar_t* mat1_data = mat1.const_data_ptr(); + const scalar_t* mat2_data = mat2.const_data_ptr(); // mat1: {B, M, K} // mat2: {B, N, K} @@ -43,8 +43,8 @@ void sampled_addmm_sparse_csr_kernel_impl( auto col = result.col_indices().reshape({-1, nnz}); auto values_acc = values.accessor(); - auto crow_acc = crow.accessor(); - auto col_acc = col.accessor(); + auto crow_acc = crow.accessor(); + auto col_acc = col.accessor(); // usually, collapse B and M is a better option, // but for most commonly used case (mat1 and mat2 is 2d tensor), B = 1, @@ -54,8 +54,8 @@ void sampled_addmm_sparse_csr_kernel_impl( auto crow_slice = crow_acc[b]; auto col_slice = col_acc[b]; auto values_slice = values_acc[b]; - scalar_t* mat1_ptr = mat1_data 
+ b * M * K; - scalar_t* mat2_ptr = mat2_data + b * N * K; + const scalar_t* mat1_ptr = mat1_data + b * M * K; + const scalar_t* mat2_ptr = mat2_data + b * N * K; utils::parallel_sparse_csr(crow_slice, M, nnz, [&](int64_t begin, int64_t end) { for (const auto m : c10::irange(begin, end)) { diff --git a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp index cae9260b5720c..bcfc26c7df7d8 100644 --- a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp +++ b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp @@ -186,7 +186,7 @@ struct cpu_scatter_gather_base_kernel { // NOLINTNEXTLINE(bugprone-argument-comment) .declare_static_shape(index.sizes(), /*squash_dim=*/dim) .add_output(buffer) - .add_input(index) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(buffer, dim); @@ -273,8 +273,8 @@ struct cpu_scatter_gather_base_kernel { // NOLINTNEXTLINE(bugprone-argument-comment) .declare_static_shape(index.sizes(), /*squash_dim=*/dim) .add_output(buffer) - .add_input(src) - .add_input(index) + .add_const_input(src) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(buffer, dim); @@ -369,8 +369,8 @@ struct cpu_scatter_gather_base_kernel { // NOLINTNEXTLINE(bugprone-argument-comment) .declare_static_shape(index.sizes(), /*squash_dim=*/dim) .add_output(buffer) - .add_input(src) - .add_input(index) + .add_const_input(src) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(buffer, dim); @@ -464,8 +464,8 @@ struct cpu_scatter_gather_base_kernel { // NOLINTNEXTLINE(bugprone-argument-comment) .declare_static_shape(index.sizes(), /*squash_dim=*/dim) .add_output(buffer) - .add_input(src) - .add_input(index) + .add_const_input(src) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(buffer, dim); @@ -560,8 +560,8 @@ struct cpu_scatter_gather_base_kernel { // NOLINTNEXTLINE(bugprone-argument-comment) .declare_static_shape(index.sizes(), /*squash_dim=*/dim) .add_output(buffer) - .add_input(src) - .add_input(index) + .add_const_input(src) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(buffer, dim); @@ -687,9 +687,9 @@ std::pair radix_sort_parallel( template void cpu_scatter_reduce_expanded_index(const Tensor& self, const Tensor& index, const Tensor& src, bool include_self) { - int64_t* index_data = index.data_ptr(); + const int64_t* index_data = index.const_data_ptr(); scalar_t* self_data = self.data_ptr(); - scalar_t* src_data = src.data_ptr(); + const scalar_t* src_data = src.const_data_ptr(); const int64_t M = ensure_nonempty_size(self, 0); const int64_t nnz = ensure_nonempty_size(index, 0); @@ -812,9 +812,9 @@ void cpu_scatter_reduce_expanded_index(const Tensor& self, const Tensor& index, template void cpu_gather_expanded_index_kernel(const Tensor& result, const Tensor& index, const Tensor& self) { - int64_t* index_data = index.data_ptr(); + const int64_t* index_data = index.const_data_ptr(); scalar_t* result_data = result.data_ptr(); - scalar_t* self_data = self.data_ptr(); + const scalar_t* self_data = self.const_data_ptr(); const int64_t M = ensure_nonempty_size(result, 0); const int64_t N = ensure_nonempty_size(self, 0); @@ -832,7 +832,7 @@ void cpu_gather_expanded_index_kernel(const Tensor& result, const Tensor& index, "index ", index, " is out of bounds for dimension ", 0, " with size ", index_upper_bound); - scalar_t* self_ptr = self_data + index * K; + const scalar_t* self_ptr = self_data + index * K; 
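        // Copy the K contiguous values of the gathered row in two passes: a
        // vectorized main loop that moves Vec::size() elements per iteration,
        // then a scalar tail loop for the remaining K % Vec::size() elements.
        // This main-loop-plus-tail split is the same pattern the other
        // vectorized CPU kernels in this patch use.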
int64_t d = 0; for (; d < K - (K % Vec::size()); d += Vec::size()) { Vec out_vec = Vec::loadu(self_ptr + d); diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index 80e5e947d692f..5f16ea72505fa 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -33,7 +33,7 @@ namespace at::native { namespace { template inline void _vec_log_softmax_lastdim( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t dim_size) { @@ -46,10 +46,13 @@ inline void _vec_log_softmax_lastdim( 1, at::internal::GRAIN_SIZE / (sizeof(scalar_t) * dim_size)); int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, outer_size); - - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); - - parallel_for(0, outer_size, grain_size, [&](int64_t begin, int64_t end) { + // Note: grain_size value of 0 + // We don't change the number of OpenMP threads in the OpenMP thread-pool, + // so some threads do useful work, while others don't. + // We can simply use grain_size of 0 & rely upon invoke_parallel to distribute + // work among threads in an equitable manner. We compute CHUNK_SIZE to ensure + // each thread's computations would be efficient. + parallel_for(0, outer_size, 0, [&](int64_t begin, int64_t end) { // MSVC requires such a declaration of dynamic arrays // Source: https://stackoverflow.com/a/33423538 auto tmp_sum_scalar = std::make_unique(CHUNK_SIZE); @@ -60,7 +63,7 @@ inline void _vec_log_softmax_lastdim( loop_end = end - ii; for (const auto j : c10::irange(loop_end)) { int64_t i = ii + j; - scalar_t* input_data = input_data_base + i * dim_size; + const scalar_t* input_data = input_data_base + i * dim_size; max_input_arr[j] = vec::reduce_all( [](Vec& x, Vec& y) { return vec::maximum(x, y); }, input_data, @@ -68,7 +71,7 @@ inline void _vec_log_softmax_lastdim( } for (const auto j : c10::irange(loop_end)) { int64_t i = ii + j; - scalar_t* input_data = input_data_base + i * dim_size; + const scalar_t* input_data = input_data_base + i * dim_size; scalar_t max_input = max_input_arr[j]; tmp_sum_scalar[j] = vec::map_reduce_all( [max_input](Vec x) { return (x - Vec(max_input)).exp(); }, @@ -85,7 +88,7 @@ inline void _vec_log_softmax_lastdim( loop_end); for (const auto j : c10::irange(loop_end)) { int64_t i = ii + j; - scalar_t* input_data = input_data_base + i * dim_size; + const scalar_t* input_data = input_data_base + i * dim_size; scalar_t* output_data = output_data_base + i * dim_size; scalar_t tmp_sum = tmp_sum_scalar[j]; scalar_t max_input = max_input_arr[j]; @@ -110,15 +113,15 @@ inline void _vec_log_softmax_lastdim( template inline typename std::enable_if_t>, void> _vec_softmax_lastdim( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t dim_size) { using Vec = vec::Vectorized; - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); - parallel_for(0, outer_size, grain_size, [&](int64_t begin, int64_t end) { + // See Note: grain_size value of 0 + parallel_for(0, outer_size, 0, [&](int64_t begin, int64_t end) { for (const auto i : c10::irange(begin, end)) { - scalar_t* input_data = input_data_base + i * dim_size; + const scalar_t* input_data = input_data_base + i * dim_size; scalar_t* output_data = output_data_base + i * dim_size; scalar_t max_input = vec::reduce_all( [](Vec& x, Vec& y) { return vec::maximum(x, y); }, @@ -144,20 +147,20 @@ _vec_softmax_lastdim( template inline typename 
std::enable_if_t>, void> _vec_softmax_lastdim( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t dim_size) { using Vec = vec::Vectorized; using fVec = vec::Vectorized; - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); - parallel_for(0, outer_size, grain_size, [&](int64_t begin, int64_t end) { + // See Note: grain_size value of 0 + parallel_for(0, outer_size, 0, [&](int64_t begin, int64_t end) { // thread local temp buffer. auto buffer = std::make_unique(dim_size); float* buffer_data = buffer.get(); for (const auto i : c10::irange(begin, end)) { - scalar_t* input_data = input_data_base + i * dim_size; + const scalar_t* input_data = input_data_base + i * dim_size; scalar_t* output_data = output_data_base + i * dim_size; // reduce to max and cache float input data fVec max_fvec = fVec(-std::numeric_limits::infinity()); @@ -210,24 +213,21 @@ _vec_softmax_lastdim( template inline void _vec_host_softmax_backward_lastdim( scalar_t* grad_input_data_base, - scalar_t* grad_data_base, - scalar_t* output_data_base, + const scalar_t* grad_data_base, + const scalar_t* output_data_base, int64_t outer_size, int64_t dim_size) { using Vec = vec::Vectorized>; - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); - if (grain_size < 1) - grain_size = 1; - + // See Note: grain_size value of 0 parallel_for( 0, outer_size, - grain_size, + 0, [&](int64_t begin, int64_t end) { for (const auto i : c10::irange(begin, end)) { scalar_t* grad_input_data = grad_input_data_base + i * dim_size; - scalar_t* grad_data = grad_data_base + i * dim_size; - scalar_t* output_data = output_data_base + i * dim_size; + const scalar_t* grad_data = grad_data_base + i * dim_size; + const scalar_t* output_data = output_data_base + i * dim_size; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) scalar_t sum; if (log_softmax) { @@ -264,21 +264,22 @@ template inline typename std::enable_if_t>, void> _vec_softmax_backward( scalar_t* grad_input_data_base, - scalar_t* grad_output_data_base, - scalar_t* output_data_base, + const scalar_t* grad_output_data_base, + const scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, int64_t dim_size) { using Vec = vec::Vectorized; int64_t outer_stride = dim_size * inner_size; int64_t BLOCK_SIZE = 128 * 1024; - int64_t CHUNK_SIZE = std::max( + int64_t MAX_CHUNK_SIZE = std::max( BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); - CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size(); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); int64_t num_chunks = divup(inner_size, CHUNK_SIZE); - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); + // See Note: grain_size value of 0 parallel_for( - 0, outer_size * num_chunks, grain_size, [&](int64_t begin, int64_t end) { + 0, outer_size * num_chunks, 0, [&](int64_t begin, int64_t end) { // thread local temp buffer that holds vertical sum result auto buffer = std::make_unique(CHUNK_SIZE); scalar_t* tmp_sum_data = buffer.get(); @@ -303,8 +304,8 @@ _vec_softmax_backward( for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { int64_t offset = outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; - scalar_t* grad_output_ptr = grad_output_data_base + offset; - scalar_t* output_ptr = output_data_base + offset; + const scalar_t* grad_output_ptr = grad_output_data_base + offset; + const scalar_t* output_ptr = output_data_base + offset; int64_t d1 = 
0; for (; d1 < size - (size % Vec::size()); d1 += Vec::size()) { @@ -323,8 +324,8 @@ _vec_softmax_backward( for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { int64_t offset = outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; - scalar_t* grad_output_ptr = grad_output_data_base + offset; - scalar_t* output_ptr = output_data_base + offset; + const scalar_t* grad_output_ptr = grad_output_data_base + offset; + const scalar_t* output_ptr = output_data_base + offset; scalar_t* grad_input_ptr = grad_input_data_base + offset; int64_t d2 = 0; @@ -347,8 +348,8 @@ template inline typename std::enable_if_t>, void> _vec_softmax_backward( scalar_t* grad_input_data_base, - scalar_t* grad_output_data_base, - scalar_t* output_data_base, + const scalar_t* grad_output_data_base, + const scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, int64_t dim_size) { @@ -356,13 +357,14 @@ _vec_softmax_backward( using fVec = vec::Vectorized; int64_t outer_stride = dim_size * inner_size; int64_t BLOCK_SIZE = 128 * 1024; - int64_t CHUNK_SIZE = std::max( + int64_t MAX_CHUNK_SIZE = std::max( BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); - CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size(); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); int64_t num_chunks = divup(inner_size, CHUNK_SIZE); - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); + // See Note: grain_size value of 0 parallel_for( - 0, outer_size * num_chunks, grain_size, [&](int64_t begin, int64_t end) { + 0, outer_size * num_chunks, 0, [&](int64_t begin, int64_t end) { // thread local temp buffer that holds vertical sum result auto buffer = std::make_unique(CHUNK_SIZE); float* tmp_sum_data = buffer.get(); @@ -395,8 +397,8 @@ _vec_softmax_backward( for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { int64_t offset = outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; - scalar_t* grad_output_ptr = grad_output_data_base + offset; - scalar_t* output_ptr = output_data_base + offset; + const scalar_t* grad_output_ptr = grad_output_data_base + offset; + const scalar_t* output_ptr = output_data_base + offset; float* grad_output_buffer_ptr = grad_output_buffer_data + dim_idx * CHUNK_SIZE; float* output_buffer_ptr = @@ -473,21 +475,22 @@ template inline typename std::enable_if_t>, void> _vec_log_softmax_backward( scalar_t* grad_input_data_base, - scalar_t* grad_output_data_base, - scalar_t* output_data_base, + const scalar_t* grad_output_data_base, + const scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, int64_t dim_size) { using Vec = vec::Vectorized; int64_t outer_stride = dim_size * inner_size; int64_t BLOCK_SIZE = 128 * 1024; - int64_t CHUNK_SIZE = std::max( + int64_t MAX_CHUNK_SIZE = std::max( BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); - CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size(); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); int64_t num_chunks = divup(inner_size, CHUNK_SIZE); - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); + // See Note: grain_size value of 0 parallel_for( - 0, outer_size * num_chunks, grain_size, [&](int64_t begin, int64_t end) { + 0, outer_size * num_chunks, 0, [&](int64_t begin, int64_t end) { // thread local temp buffer that holds vertical sum result auto buffer = std::make_unique(CHUNK_SIZE); scalar_t* tmp_sum_data = buffer.get(); 
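The recurring change in these backward kernels is twofold: the per-thread chunk is still sized from a 128 KiB working-set target and rounded down to a vector multiple, but it is now also clamped to inner_size, and the parallel_for grain size is set to 0 so work is spread evenly across threads (see the "grain_size value of 0" note above). The chunk arithmetic, distilled into a standalone sketch with illustrative parameters (float, 8-lane vectors) rather than the actual template code:

#include <algorithm>
#include <cstdint>

// Sketch of the chunk-size selection used by the softmax backward kernels,
// with illustrative element size and vector width.
int64_t pick_chunk_size(int64_t dim_size, int64_t inner_size,
                        int64_t elem_size = sizeof(float), int64_t vec_lanes = 8) {
  const int64_t BLOCK_SIZE = 128 * 1024;                        // target working set per chunk, in bytes
  int64_t max_chunk = std::max(BLOCK_SIZE / dim_size / elem_size, vec_lanes);
  max_chunk = max_chunk / vec_lanes * vec_lanes;                // round down to a vector multiple
  return std::min(max_chunk, inner_size);                       // never exceed the actual inner extent
}

With dim_size = 1024 floats, for example, max_chunk comes out to 32; a tensor with inner_size = 8 now gets a chunk of 8 instead of 32, so the thread-local temporary buffers match the work actually done.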
@@ -510,7 +513,7 @@ _vec_log_softmax_backward( // compute sum of grad_output for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { - scalar_t* grad_output_ptr = grad_output_data_base + + const scalar_t* grad_output_ptr = grad_output_data_base + outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; @@ -530,8 +533,8 @@ _vec_log_softmax_backward( for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { int64_t offset = outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; - scalar_t* grad_output_ptr = grad_output_data_base + offset; - scalar_t* output_ptr = output_data_base + offset; + const scalar_t* grad_output_ptr = grad_output_data_base + offset; + const scalar_t* output_ptr = output_data_base + offset; scalar_t* grad_input_ptr = grad_input_data_base + offset; int64_t d2 = 0; @@ -555,8 +558,8 @@ template inline typename std::enable_if_t>, void> _vec_log_softmax_backward( scalar_t* grad_input_data_base, - scalar_t* grad_output_data_base, - scalar_t* output_data_base, + const scalar_t* grad_output_data_base, + const scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, int64_t dim_size) { @@ -564,13 +567,14 @@ _vec_log_softmax_backward( using fVec = vec::Vectorized; int64_t outer_stride = dim_size * inner_size; int64_t BLOCK_SIZE = 128 * 1024; - int64_t CHUNK_SIZE = std::max( + int64_t MAX_CHUNK_SIZE = std::max( BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); - CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size(); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); int64_t num_chunks = divup(inner_size, CHUNK_SIZE); - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); + // See Note: grain_size value of 0 parallel_for( - 0, outer_size * num_chunks, grain_size, [&](int64_t begin, int64_t end) { + 0, outer_size * num_chunks, 0, [&](int64_t begin, int64_t end) { // thread local temp buffer that holds vertical sum result auto buffer = std::make_unique(CHUNK_SIZE); float* tmp_sum_data = buffer.get(); @@ -598,7 +602,7 @@ _vec_log_softmax_backward( // compute sum of grad_output for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { - scalar_t* grad_output_ptr = grad_output_data_base + + const scalar_t* grad_output_ptr = grad_output_data_base + outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; float* grad_output_buffer_ptr = @@ -632,7 +636,7 @@ _vec_log_softmax_backward( for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { int64_t offset = outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; - scalar_t* output_ptr = output_data_base + offset; + const scalar_t* output_ptr = output_data_base + offset; scalar_t* grad_input_ptr = grad_input_data_base + offset; float* grad_output_buffer_ptr = grad_output_buffer_data + dim_idx * CHUNK_SIZE; @@ -671,7 +675,7 @@ struct vec_host_softmax_lastdim { int64_t dim_size = input.size(input.ndimension() - 1); for (int64_t i = 0; i < input.ndimension() - 1; ++i) outer_size *= input.size(i); - scalar_t* input_data_base = input.data_ptr(); + const scalar_t* input_data_base = input.const_data_ptr(); scalar_t* output_data_base = output.data_ptr(); if (LogSoftMax) { _vec_log_softmax_lastdim( @@ -686,7 +690,7 @@ struct vec_host_softmax_lastdim { template inline typename std::enable_if_t>, void> _vec_softmax( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, @@ -695,10 +699,10 @@ _vec_softmax( using Vec16 = 
vec::Vectorized; int64_t dim_stride = inner_size; int64_t outer_stride = dim_size * dim_stride; - int64_t grain_size = internal::GRAIN_SIZE / dim_size; int vectorized_step = Vec16().size(); // Currently, we only support BFloat16/Half in this special implementation + // See Note: grain_size value of 0 parallel_for( - 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { + 0, outer_size * inner_size, 0, [&](int64_t begin, int64_t end) { int64_t idx = begin; std::unique_ptr temp_vec_input(new float[dim_size*vectorized_step]()); std::unique_ptr temp_vec_output(new float[dim_size*vectorized_step]()); @@ -709,7 +713,7 @@ _vec_softmax( int64_t inner_idx = idx % inner_size; if (((inner_idx + vectorized_step) <= inner_size) && ((idx + vectorized_step) <= end)) { // Vectorization - scalar_t* input_data = + const scalar_t* input_data = input_data_base + outer_idx * outer_stride + inner_idx; scalar_t* output_data = output_data_base + outer_idx * outer_stride + inner_idx; @@ -756,13 +760,13 @@ _vec_softmax( // Tail case(Scalar): it is exactly same logic as host_softmax // inside aten/src/ATen/native/SoftMax.cpp. There are 2 kind of // cases which will fall through this part: - // Case 1: For the idx at the end of total chunk for each thread, there are not enough numbers for parallization. - // Case 2: For the idx at the end of each inner_size inside thread, there are not enough numbers for parallization. + // Case 1: For the idx at the end of total chunk for each thread, there are not enough numbers for parallelization. + // Case 2: For the idx at the end of each inner_size inside thread, there are not enough numbers for parallelization. int64_t tail_number = ((idx+vectorized_step) > end) ? /*Case1*/ (end - idx) : /*Case2*/ (inner_size - inner_idx); for (const auto i : c10::irange(tail_number)) { outer_idx = (idx + i) / inner_size; inner_idx = (idx + i) % inner_size; - scalar_t* input_data = + const scalar_t* input_data = input_data_base + outer_idx * outer_stride + inner_idx; scalar_t* output_data = output_data_base + outer_idx * outer_stride + inner_idx; @@ -794,7 +798,7 @@ _vec_softmax( template inline typename std::enable_if_t>, void> _vec_softmax( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, @@ -802,17 +806,17 @@ _vec_softmax( using Vec = vec::Vectorized; int64_t dim_stride = inner_size; int64_t outer_stride = dim_size * dim_stride; - int64_t grain_size = internal::GRAIN_SIZE / dim_size; int vectorized_step = Vec().size(); + // See Note: grain_size value of 0 parallel_for( - 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { + 0, outer_size * inner_size, 0, [&](int64_t begin, int64_t end) { int64_t idx = begin; while (idx < end) { int64_t outer_idx = idx / inner_size; int64_t inner_idx = idx % inner_size; if (((inner_idx + vectorized_step) <= inner_size) && ((idx + vectorized_step) <= end)) { // Vectorization - scalar_t* input_data = + const scalar_t* input_data = input_data_base + outer_idx * outer_stride + inner_idx; scalar_t* output_data = output_data_base + outer_idx * outer_stride + inner_idx; @@ -841,13 +845,13 @@ _vec_softmax( // Tail case(Scalar): it is exactly same logic as host_softmax // inside aten/src/ATen/native/SoftMax.cpp. There are 2 kind of // cases which will fall through this part: - // Case 1: For the idx at the end of total chunk for each thread, there are not enough numbers for parallization. 
- // Case 2: For the idx at the end of each inner_size inside thread, there are not enough numbers for parallization. + // Case 1: For the idx at the end of total chunk for each thread, there are not enough numbers for parallelization. + // Case 2: For the idx at the end of each inner_size inside thread, there are not enough numbers for parallelization. int64_t tail_number = ((idx+vectorized_step) > end) ? /*Case1*/ (end - idx) : /*Case2*/ (inner_size - inner_idx); for (const auto i : c10::irange(tail_number)) { outer_idx = (idx + i) / inner_size; inner_idx = (idx + i) % inner_size; - scalar_t* input_data = + const scalar_t* input_data = input_data_base + outer_idx * outer_stride + inner_idx; scalar_t* output_data = output_data_base + outer_idx * outer_stride + inner_idx; @@ -878,7 +882,7 @@ _vec_softmax( // NB: fast kernel for log_softmax when dim != -1 // input shape is normalized to {outer_size, dim_size, inner_size} // -// The algorithm requires to load input tensor 3 times, to increase parallelsim +// The algorithm requires to load input tensor 3 times, to increase parallelism // and cache hit rate, inner_size is blocked as: // inner_size: {CHUNK_SIZE, CHUNK_SIZE, ..., Remainder} // @@ -888,19 +892,20 @@ _vec_softmax( template inline typename std::enable_if_t>, void> _vec_logsoftmax( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, int64_t dim_size) { using Vec = vec::Vectorized; int64_t BLOCK_SIZE = 128 * 1024; - int64_t CHUNK_SIZE = std::max(BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); - CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t MAX_CHUNK_SIZE = std::max(BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); int64_t num_chunks = divup(inner_size, CHUNK_SIZE); - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); - at::parallel_for(0, outer_size * num_chunks, grain_size, [&](int64_t begin, int64_t end) { + // See Note: grain_size value of 0 + at::parallel_for(0, outer_size * num_chunks, 0, [&](int64_t begin, int64_t end) { // thread local temp buffer which holds vertical reduction result: max and sum. 
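An aside on the CHUNK_SIZE change visible in this hunk: the chunk width is now clamped to inner_size, so a small inner dimension no longer gets a chunk sized purely from the 128 KiB block target. Below is a minimal standalone sketch of the sizing math and of how a flat work index could decompose into (outer_idx, chunk); divup is assumed to be the usual ceiling division, vec_size stands in for Vec::size(), and the decomposition is my reading of the elided loop body, not a copy of it.

```cpp
// Sketch (not the kernel itself) of how the patched kernels derive CHUNK_SIZE.
#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr int64_t divup(int64_t x, int64_t y) { return (x + y - 1) / y; }

int main() {
  using scalar_t = float;
  const int64_t inner_size = 1000, dim_size = 64, outer_size = 8;
  const int64_t vec_size = 16;            // stand-in for vec::Vectorized<scalar_t>::size()
  const int64_t BLOCK_SIZE = 128 * 1024;  // target per-chunk working-set size in bytes

  // Largest chunk whose dim_size x CHUNK_SIZE slab fits in roughly BLOCK_SIZE bytes,
  // rounded down to a multiple of the vector width but never below it.
  int64_t MAX_CHUNK_SIZE =
      std::max<int64_t>(BLOCK_SIZE / dim_size / sizeof(scalar_t), vec_size);
  MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / vec_size * vec_size;
  // New in this patch: never make a chunk wider than inner_size itself.
  const int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size);
  const int64_t num_chunks = divup(inner_size, CHUNK_SIZE);

  // Assumed decomposition of the flat work index over outer_size * num_chunks items.
  for (int64_t i = 0; i < outer_size * num_chunks; ++i) {
    int64_t outer_idx = i / num_chunks;
    int64_t inner_idx_begin = (i % num_chunks) * CHUNK_SIZE;
    int64_t size = std::min(CHUNK_SIZE, inner_size - inner_idx_begin);
    (void)outer_idx; (void)size;
  }
  std::printf("CHUNK_SIZE=%lld num_chunks=%lld\n",
              (long long)CHUNK_SIZE, (long long)num_chunks);
  return 0;
}
```

The grain_size argument is now passed as 0, deferring thread splitting to at::parallel_for per the note the comment references, instead of the old GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE) formula.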
auto buffer = std::make_unique(CHUNK_SIZE * 2); scalar_t* input_max_data = buffer.get(); @@ -927,7 +932,7 @@ _vec_logsoftmax( // compute max for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { - scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + const scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + dim_idx * inner_size + inner_idx_begin; int64_t d1 = 0; @@ -946,7 +951,7 @@ _vec_logsoftmax( // compute sum of (x - max).exp() for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { - scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + const scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + dim_idx * inner_size + inner_idx_begin; int64_t d2 = 0; @@ -970,7 +975,7 @@ _vec_logsoftmax( // compute x - max - sum for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { int64_t offset = outer_idx * dim_size * inner_size + dim_idx * inner_size + inner_idx_begin; - scalar_t* input_ptr = input_data_base + offset; + const scalar_t* input_ptr = input_data_base + offset; scalar_t* output_ptr = output_data_base + offset; int64_t d3 = 0; @@ -992,7 +997,7 @@ _vec_logsoftmax( template inline typename std::enable_if_t>, void> _vec_logsoftmax( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, @@ -1000,12 +1005,13 @@ _vec_logsoftmax( using Vec = vec::Vectorized; using fVec = vec::Vectorized; int64_t BLOCK_SIZE = 128 * 1024; - int64_t CHUNK_SIZE = std::max(BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); - CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t MAX_CHUNK_SIZE = std::max(BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); int64_t num_chunks = divup(inner_size, CHUNK_SIZE); - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); - at::parallel_for(0, outer_size * num_chunks, grain_size, [&](int64_t begin, int64_t end) { + // See Note: grain_size value of 0 + at::parallel_for(0, outer_size * num_chunks, 0, [&](int64_t begin, int64_t end) { auto buffer = std::make_unique(CHUNK_SIZE * 2); float* input_max_data = buffer.get(); float* tmp_sum_data = buffer.get() + CHUNK_SIZE; @@ -1037,7 +1043,7 @@ _vec_logsoftmax( // compute max for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { - scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + const scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + dim_idx * inner_size + inner_idx_begin; float* input_buffer_ptr = input_buffer_data + dim_idx * CHUNK_SIZE; @@ -1127,7 +1133,7 @@ struct vec_softmax { for (const auto i : c10::irange(dim))outer_size *= input.size(i); for (int64_t i = dim + 1; i < input.dim(); ++i) inner_size *= input.size(i); - scalar_t* input_data_base = input.data_ptr(); + const scalar_t* input_data_base = input.const_data_ptr(); scalar_t* output_data_base = output.data_ptr(); if (LogSoftMax) { _vec_logsoftmax( @@ -1148,8 +1154,8 @@ struct vec_host_softmax_backward_lastdim { for (int64_t i = 0; i < grad.ndimension() - 1; ++i) outer_size *= grad.size(i); scalar_t* grad_input_data_base = grad_input.mutable_data_ptr(); - scalar_t* grad_data_base = grad.data_ptr(); - scalar_t* output_data_base = output.data_ptr(); + const scalar_t* grad_data_base = grad.const_data_ptr(); + const scalar_t* output_data_base = output.const_data_ptr(); 
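The pointer changes just above are part of a const-correctness sweep: tensors that are only read now go through const_data_ptr<T>() (with add_const_input() on the TensorIterator side elsewhere in this patch), while the written tensor keeps mutable_data_ptr<T>(). A minimal illustrative helper showing the pattern; axpy_like is a made-up example, not a function from this patch, and it assumes contiguous float tensors of equal shape.

```cpp
#include <ATen/ATen.h>

// Read-only input accessed through const_data_ptr<T>(); destination through
// mutable_data_ptr<T>(). Purely illustrative of the access pattern.
void axpy_like(at::Tensor& out, const at::Tensor& in, float alpha) {
  TORCH_CHECK(in.is_contiguous() && out.is_contiguous());
  TORCH_CHECK(in.scalar_type() == at::kFloat && out.scalar_type() == at::kFloat);
  TORCH_CHECK(out.sizes().equals(in.sizes()));
  const float* src = in.const_data_ptr<float>();
  float* dst = out.mutable_data_ptr<float>();
  for (int64_t i = 0; i < in.numel(); ++i) {
    dst[i] += alpha * src[i];
  }
}
```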
_vec_host_softmax_backward_lastdim( grad_input_data_base, grad_data_base, @@ -1176,8 +1182,8 @@ struct vec_host_softmax_backward { inner_size *= grad.size(i); } scalar_t* grad_input_data_base = grad_input.mutable_data_ptr(); - scalar_t* grad_output_data_base = grad.data_ptr(); - scalar_t* output_data_base = output.data_ptr(); + const scalar_t* grad_output_data_base = grad.const_data_ptr(); + const scalar_t* output_data_base = output.const_data_ptr(); if (LogSoftMax) { _vec_log_softmax_backward( grad_input_data_base, diff --git a/aten/src/ATen/native/cpu/SortingKernel.cpp b/aten/src/ATen/native/cpu/SortingKernel.cpp index 89756906580a8..22ba0152153d3 100644 --- a/aten/src/ATen/native/cpu/SortingKernel.cpp +++ b/aten/src/ATen/native/cpu/SortingKernel.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -42,9 +43,8 @@ void _dim_apply( auto indices_dim_stride = indices.stride(dim); auto dim_size = values.size(dim); - AT_DISPATCH_ALL_TYPES_AND3( - ScalarType::Bool, ScalarType::Half, ScalarType::BFloat16, iter.dtype(), - "sorting_kernel_method_name", [&] { + AT_DISPATCH_V2( + iter.dtype(), "sorting_kernel_method_name", AT_WRAP([&] { auto loop = [&](char** data, const int64_t* strides, int64_t n) { auto* values_data_bytes = data[0]; auto* indices_data_bytes = data[1]; @@ -69,7 +69,7 @@ void _dim_apply( int64_t grain_size = internal::GRAIN_SIZE / std::max(int64_t{1}, dim_size); iter.for_each(loop, /*grain_size=*/grain_size); - } + }), kBool, kHalf, kBFloat16, AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES) ); } @@ -216,7 +216,7 @@ static void topk_kernel( .declare_static_shape(sizes, /*squash_dims=*/dim) .add_output(values) .add_output(indices) - .add_input(self) + .add_const_input(self) .build(); auto mode_values_stride = values.strides()[dim]; diff --git a/aten/src/ATen/native/cpu/SparseFactories.cpp b/aten/src/ATen/native/cpu/SparseFactories.cpp index 8f938e545f27a..2c0b54b8dd7af 100644 --- a/aten/src/ATen/native/cpu/SparseFactories.cpp +++ b/aten/src/ATen/native/cpu/SparseFactories.cpp @@ -29,7 +29,7 @@ void _spdiags_kernel_cpu( "spdiags_cpu", [&] { auto* const values_write_ptr = values.data_ptr(); - const auto* const diagonals_ptr = diagonals.data_ptr(); + const auto* const diagonals_ptr = diagonals.const_data_ptr(); cpu_kernel( iter, diff --git a/aten/src/ATen/native/cpu/SpmmReduceKernel.cpp b/aten/src/ATen/native/cpu/SpmmReduceKernel.cpp index d9aa9a35f1b0d..36f36746dbd89 100644 --- a/aten/src/ATen/native/cpu/SpmmReduceKernel.cpp +++ b/aten/src/ATen/native/cpu/SpmmReduceKernel.cpp @@ -24,7 +24,7 @@ namespace at { namespace native { namespace { template -inline void _update(at::opmath_type* out_ptr, int64_t e, int64_t c, const scalar_t val, scalar_t* other_data, int64_t K) { +inline void _update(at::opmath_type* out_ptr, int64_t e, int64_t c, const scalar_t val, const scalar_t* other_data, int64_t K) { using opmath_t = at::opmath_type; using Vec = vec::Vectorized; using aVec = VecType; @@ -33,7 +33,7 @@ inline void _update(at::opmath_type* out_ptr, int64_t e, int64_t c, co int64_t k = 0; aVec val_vec = aVec((opmath_t)val); - scalar_t* other_ptr = other_data + c * K; + const scalar_t* other_ptr = other_data + c * K; for (; k < K - (K % kVLEN); k += kVLEN) { aVec out_vec0 = aVec::loadu(out_ptr + k); @@ -78,12 +78,12 @@ void spmm_reduce_kernel_impl( auto other = other_.contiguous(); - // access `crow_indices`, `col_indices` and `values` via TessorAccessor + // access `crow_indices`, `col_indices` and `values` via TensorAccessor scalar_t* 
out_data = out.data_ptr(); - auto csr_data = crow_indices.accessor(); - auto col_data = col_indices.accessor(); - auto val_data = values.accessor(); - scalar_t* other_data = other.data_ptr(); + auto csr_data = crow_indices.accessor(); + auto col_data = col_indices.accessor(); + auto val_data = values.accessor(); + const scalar_t* other_data = other.const_data_ptr(); int64_t M = crow_indices.numel() - 1; int64_t K = other.size(-1); @@ -178,10 +178,10 @@ void spmm_reduce_arg_kernel_impl( scalar_t* out_data = out.data_ptr(); index_t* arg_out_data = arg_out.data_ptr(); - auto csr_data = crow_indices.accessor(); - auto col_data = col_indices.accessor(); - auto val_data = values.accessor(); - scalar_t* other_data = other.data_ptr(); + auto csr_data = crow_indices.accessor(); + auto col_data = col_indices.accessor(); + auto val_data = values.accessor(); + const scalar_t* other_data = other.const_data_ptr(); int64_t M = crow_indices.numel() - 1; int64_t K = other.size(-1); @@ -222,7 +222,7 @@ void spmm_reduce_arg_kernel_impl( c = col_data[e]; opmath_t val = opmath_t(val_data[e]); - scalar_t* other_ptr = other_data + c * K; + const scalar_t* other_ptr = other_data + c * K; for (const auto k : c10::irange(K)) { update_with_index( &buffer_ptr[k], opmath_t(val * other_ptr[k]), &arg_out_ptr[k], index_t(e)); @@ -257,11 +257,11 @@ void spmm_reduce_backward_input_kernel_impl( auto values = grad_self.values(); auto grad_values_data = values.accessor(); - scalar_t* grad_out_data = grad_out.data_ptr(); - auto crow_data = crow_indices.accessor(); - auto col_data = col_indices.accessor(); - scalar_t* other_data = other.data_ptr(); - auto row_data = row_indices.accessor(); + const scalar_t* grad_out_data = grad_out.const_data_ptr(); + auto crow_data = crow_indices.accessor(); + auto col_data = col_indices.accessor(); + const scalar_t* other_data = other.const_data_ptr(); + auto row_data = row_indices.accessor(); int64_t K = grad_out.size(1); @@ -307,9 +307,9 @@ void spmm_reduce_backward_input_arg_kernel_impl( auto grad_values = grad_self.values(); auto grad_values_data = grad_values.accessor(); - scalar_t* grad_out_data = grad_out.data_ptr(); - auto col_data = col_indices.accessor(); - scalar_t* other_data = other.data_ptr(); + const scalar_t* grad_out_data = grad_out.const_data_ptr(); + auto col_data = col_indices.accessor(); + const scalar_t* other_data = other.const_data_ptr(); index_t* arg_out_data = arg_out.data_ptr(); int64_t M = grad_out.size(0); @@ -319,7 +319,7 @@ void spmm_reduce_backward_input_arg_kernel_impl( at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { for (const auto m : c10::irange(begin, end)) { - scalar_t* grad_out_ptr = grad_out_data + m * K; + const scalar_t* grad_out_ptr = grad_out_data + m * K; scalar_t* grad_ptr = grad_data + m * K; index_t* arg_out_ptr = arg_out_data + m * K; @@ -389,10 +389,10 @@ void spmm_reduce_backward_other_arg_kernel_impl( auto arg_out = arg_out_.contiguous(); scalar_t* grad_other_data = grad_other.data_ptr(); - scalar_t* grad_out_data = grad_out.data_ptr(); - auto col_data = col_indices.accessor(); - auto values_data = values.accessor(); - index_t* arg_out_data = arg_out.data_ptr(); + const scalar_t* grad_out_data = grad_out.const_data_ptr(); + auto col_data = col_indices.accessor(); + auto values_data = values.accessor(); + const index_t* arg_out_data = arg_out.const_data_ptr(); int64_t M = grad_out.size(0); int64_t K = grad_out.size(1); @@ -401,9 +401,9 @@ void spmm_reduce_backward_other_arg_kernel_impl( at::parallel_for(0, M, 1, [&](int64_t begin, 
int64_t end) { for (const auto m : c10::irange(begin, end)) { - scalar_t* grad_out_ptr = grad_out_data + m * K; + const scalar_t* grad_out_ptr = grad_out_data + m * K; scalar_t* grad_ptr = grad_data + m * K; - index_t* arg_out_ptr = arg_out_data + m * K; + const index_t* arg_out_ptr = arg_out_data + m * K; for (const auto k : c10::irange(K)) { if (arg_out_ptr[k] == index_t(nnz)) { diff --git a/aten/src/ATen/native/cpu/SumKernel.cpp b/aten/src/ATen/native/cpu/SumKernel.cpp index 3f0fde5d4b6e2..7865a6a82d272 100644 --- a/aten/src/ATen/native/cpu/SumKernel.cpp +++ b/aten/src/ATen/native/cpu/SumKernel.cpp @@ -6,7 +6,7 @@ #include #include #include - +#include #include namespace at::native { @@ -82,8 +82,13 @@ struct CastLoadPolicy: }; // For inner sum, load full vec_t then sum partials down to vacc_t size +template +struct InnerSumCastLoadPolicy; + template -struct InnerSumCastLoadPolicy { +struct InnerSumCastLoadPolicy >) && + !std::is_same_v>> { using scalar_t = vechold_type; using acc_t = vechold_type; @@ -100,30 +105,35 @@ struct InnerSumCastLoadPolicy { }; template -struct InnerSumCastLoadPolicy: +struct InnerSumCastLoadPolicy: LoadPolicy { }; -template <> -struct InnerSumCastLoadPolicy, Vectorized> { - using vec_t = Vectorized; - using vacc_t = Vectorized; +template +struct InnerSumCastLoadPolicy >>> { + using scalar_t = vechold_type; static constexpr int64_t memsize() { return LoadPolicy::memsize(); } static vacc_t load(const char * C10_RESTRICT data, int64_t stride, int64_t index) { - auto ptr = reinterpret_cast(data + stride * index); + auto ptr = reinterpret_cast(data + stride * index); vacc_t first, second; - vec::load_fp32_from_bf16(ptr, first, second); + vec::load_to_float(ptr, first, second); return first + second; } }; // For outer sum, load a partial vec_t of size vacc_t then cast to vacc_t +template +struct OuterSumCastLoadPolicy; + template -struct OuterSumCastLoadPolicy { +struct OuterSumCastLoadPolicy >) && + !std::is_same_v>> { + using scalar_t = vechold_type; using acc_t = vechold_type; @@ -146,25 +156,24 @@ struct OuterSumCastLoadPolicy { } }; -template <> -struct OuterSumCastLoadPolicy, Vectorized> { - using vec_t = Vectorized; - using vacc_t = Vectorized; +template +struct OuterSumCastLoadPolicy >>> { + using scalar_t = vechold_type; static constexpr int64_t memsize() { - return sizeof(c10::BFloat16) * vacc_t::size(); + return sizeof(scalar_t) * vacc_t::size(); } static vacc_t load(const char * C10_RESTRICT data, int64_t stride, int64_t index) { - auto ptr = reinterpret_cast(data + stride * index); + auto ptr = reinterpret_cast(data + stride * index); vacc_t values; - vec::load_fp32_from_bf16(ptr, values); + vec::load_to_float(ptr, values); return values; } }; template -struct OuterSumCastLoadPolicy: +struct OuterSumCastLoadPolicy: LoadPolicy { }; @@ -210,8 +219,13 @@ struct NanSumCastLoadPolicy { } }; +template +struct InnerNanSumCastLoadPolicy; + template -struct InnerNanSumCastLoadPolicy { +struct InnerNanSumCastLoadPolicy >) && + !std::is_same_v>> { using scalar_t = vechold_type; using acc_t = vechold_type; @@ -228,23 +242,22 @@ struct InnerNanSumCastLoadPolicy { }; template -struct InnerNanSumCastLoadPolicy : +struct InnerNanSumCastLoadPolicy: NanSumLoadPolicy { }; -template <> -struct InnerNanSumCastLoadPolicy, Vectorized> { - using vec_t = Vectorized; - using vacc_t = Vectorized; +template +struct InnerNanSumCastLoadPolicy >>> { + using scalar_t = vechold_type; static constexpr int64_t memsize() { return LoadPolicy::memsize(); } static vacc_t load(const char * 
C10_RESTRICT data, int64_t stride, int64_t index) { - auto ptr = reinterpret_cast(data + stride * index); + auto ptr = reinterpret_cast(data + stride * index); vacc_t first, second; - vec::load_fp32_from_bf16(ptr, first, second); + vec::load_to_float(ptr, first, second); const vacc_t zero(0); return (vacc_t::blendv(first, zero, first.isnan()) + vacc_t::blendv(second, zero, second.isnan())); diff --git a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp index f014c34c7e2e0..984e60056af9a 100644 --- a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp +++ b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp @@ -59,7 +59,7 @@ static inline void compare_base_kernel_core( .declare_static_shape(self.sizes(), /*squash_dims=*/dim) .add_output(result1) .add_output(result2) - .add_input(self) + .add_const_input(self) .build(); iter.for_each(loop, /* grain_size */ 1); @@ -320,13 +320,13 @@ static void isin_default_kernel_cpu( auto iter = TensorIteratorConfig() .add_output(out) - .add_input(promoted_elements) + .add_const_input(promoted_elements) .check_all_same_dtype(false) .build(); // Dispatch based on promoted type. AT_DISPATCH_ALL_TYPES(iter.dtype(1), "isin_default_cpu", [&]() { cpu_kernel(iter, [&](scalar_t element_val) -> bool { - const auto* test_element_data = test_elements_flat.data_ptr(); + const auto* test_element_data = test_elements_flat.const_data_ptr(); for (const auto j : c10::irange(test_elements_flat.numel())) { if (element_val == *(test_element_data + test_elements_stride * j)) { return !invert; diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index a966e4ac6dd18..461ceb2f36383 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -45,8 +44,7 @@ static void sigmoid_kernel(TensorIteratorBase& iter) { return static_cast(1) / (static_cast(1) + std::exp((-a0))); }, [=](Vectorized a) { - Vectorized a0, a1; - std::tie(a0, a1) = convert_to_float(a); + auto [a0, a1] = convert_to_float(a); a0 = (Vectorized(static_cast(1)) + a0.neg().exp()).reciprocal(); a1 = (Vectorized(static_cast(1)) + a1.neg().exp()).reciprocal(); return convert_from_float(a0, a1); @@ -145,6 +143,7 @@ static void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar) { const scalar_t eps = eps_scalar.to(); if (at::hasMKL() && iter.is_contiguous()) { LogitMKLKernel(eps, &iter); + iter.cast_outputs(); } else if (eps < scalar_t(0)) { const Vectorized kOneVec(scalar_t(1)); cpu_kernel_vec( @@ -180,9 +179,9 @@ static void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar) { } #if !defined(C10_MOBILE) -#define _AT_DISPATCH_ABS_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \ - kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, \ +#define _AT_DISPATCH_ABS_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND6( \ + kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, \ TYPE, NAME, __VA_ARGS__) #else #define _AT_DISPATCH_ABS_TYPES(TYPE, NAME, ...) 
\ @@ -356,8 +355,9 @@ static void sinc_kernel(TensorIteratorBase& iter) { if (a == scalar_t(0)) { return scalar_t(1); } else { - scalar_t product = c10::pi * a; - return std::sin(product) / product; + using opmath_t = at::opmath_type; + opmath_t product = c10::pi * opmath_t{a}; + return static_cast(std::sin(product) / product); } }); }); @@ -523,8 +523,8 @@ static void kaiser_window_kernel(TensorIteratorBase& iter, int64_t window_length using opmath_t = at::opmath_type; const opmath_t alpha = static_cast((window_length - 1) / 2.0); const opmath_t beta_ = static_cast(beta); - cpu_kernel(iter, [=](scalar_t a){ - return calc_i0(beta_ * std::sqrt(1 - std::pow((static_cast(a) - alpha) / alpha, static_cast(2.0)))) / calc_i0(beta_); + cpu_kernel(iter, [=](scalar_t a) -> scalar_t { + return calc_i0(beta_ * std::sqrt(std::abs(1 - std::pow((static_cast(a) - alpha) / alpha, static_cast(2.0))))) / calc_i0(beta_); }); }); } diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp index bb35ef23b8eaa..026cfa812f3c6 100644 --- a/aten/src/ATen/native/cpu/Unfold2d.cpp +++ b/aten/src/ATen/native/cpu/Unfold2d.cpp @@ -228,7 +228,7 @@ void unfolded2d_acc_kernel( template static void unfolded2d_copy( - scalar_t* input_data, + const scalar_t* input_data, scalar_t* finput_data, int64_t kH, int64_t kW, @@ -256,7 +256,7 @@ static void unfolded2d_copy( nip * ((size_t)kH * kW * output_height * output_width) + kh * ((size_t)kW * output_height * output_width) + kw * ((size_t)output_height * output_width); - scalar_t* src = + const scalar_t* src = input_data + nip * ((size_t)input_height * input_width); if (padW > 0 || padH > 0) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -335,7 +335,7 @@ static void unfolded2d_copy( template static void unfolded2d_copy_channels_last( - scalar_t* input_data, + const scalar_t* input_data, scalar_t* finput_data, int64_t kH, int64_t kW, @@ -355,7 +355,7 @@ static void unfolded2d_copy_channels_last( for (const auto k C10_UNUSED: c10::irange(start, end)) { scalar_t* dst = finput_data + y * output_width * kH * kW * n_input_plane + x * kH * kW * n_input_plane; - scalar_t* src = input_data; + const scalar_t* src = input_data; if (padW > 0 || padH > 0) { for (int64_t kh = 0; kh < kH; kh++) { @@ -393,7 +393,7 @@ static void unfolded2d_copy_channels_last( void unfolded2d_copy_kernel( ScalarType dtype, void *finput_data, - void *input_data, + const void *input_data, int64_t kH, int64_t kW, int64_t dH, @@ -415,7 +415,7 @@ void unfolded2d_copy_kernel( if (is_channels_last) { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, dtype, "unfolded2d_copy_channels_last", [&] { unfolded2d_copy_channels_last( - static_cast(input_data), + static_cast(input_data), static_cast(finput_data), kH, kW, dH, dW, @@ -429,7 +429,7 @@ void unfolded2d_copy_kernel( } else { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, dtype, "unfolded2d_copy", [&] { unfolded2d_copy( - static_cast(input_data), + static_cast(input_data), static_cast(finput_data), kH, kW, dH, dW, diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index bee568881a95e..67fe50c1d2a62 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -73,30 +73,30 @@ using scale_t = std::vector>; // - recursively compute interpolated output for each dimension // - we rely a lot on compiler's code optimization such that implemented operations // can be automatically 
factorized and vectorized using SSE and AVX2 -template +template struct Interpolate { - static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { + static inline opmath_t eval(char* src, char** data, const int64_t* strides, int64_t i) { index_t ids = *(index_t*)&data[0][i * strides[0]]; - scalar_t wts = *(scalar_t*)&data[1][i * strides[1]]; - scalar_t t = Interpolate::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i); - scalar_t output = t * wts; + opmath_t wts = *(scalar_t*)&data[1][i * strides[1]]; + opmath_t t = Interpolate::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i); + opmath_t output = t * wts; for (const auto j : c10::irange(1, interp_size)) { ids = *(index_t*)&data[2 * j + 0][i * strides[2 * j + 0]]; wts = *(scalar_t*)&data[2 * j + 1][i * strides[2 * j + 1]]; - t = Interpolate::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i); + t = Interpolate::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i); output += t * wts; } return output; } }; -template -struct Interpolate<1, scalar_t, index_t, interp_size> { - static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { +template +struct Interpolate<1, scalar_t, opmath_t, index_t, interp_size> { + static inline opmath_t eval(char* src, char** data, const int64_t* strides, int64_t i) { index_t ids = *(index_t*)&data[0][i * strides[0]]; - scalar_t wts = *(scalar_t*)&data[1][i * strides[1]]; - scalar_t t = *(scalar_t *)&src[ids]; - scalar_t output = t * wts; + opmath_t wts = *(scalar_t*)&data[1][i * strides[1]]; + opmath_t t = *(scalar_t *)&src[ids]; + opmath_t output = t * wts; for (const auto j : c10::irange(1, interp_size)) { ids = *(index_t*)&data[2 * j + 0][i * strides[2 * j + 0]]; wts = *(scalar_t*)&data[2 * j + 1][i * strides[2 * j + 1]]; @@ -107,17 +107,17 @@ struct Interpolate<1, scalar_t, index_t, interp_size> { } }; -template -struct Interpolate { - static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { +template +struct Interpolate { + static inline opmath_t eval(char* src, char** data, const int64_t* strides, int64_t i) { index_t ids = *(index_t*)&data[0][i * strides[0]]; - return Interpolate::eval(src + ids, &data[2], &strides[2], i); + return Interpolate::eval(src + ids, &data[2], &strides[2], i); } }; -template -struct Interpolate<1, scalar_t, index_t, 1> { - static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { +template +struct Interpolate<1, scalar_t, opmath_t, index_t, 1> { + static inline opmath_t eval(char* src, char** data, const int64_t* strides, int64_t i) { index_t ids = *(index_t*)&data[0][i * strides[0]]; return *(scalar_t *)&src[ids]; } @@ -128,37 +128,38 @@ struct Interpolate<1, scalar_t, index_t, 1> { // Once the issue is fixed we can keep generic implementation and remove: // struct Interpolate and // struct Interpolate<1, scalar_t, index_t, 2> -template -struct Interpolate { - static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { +template +struct Interpolate { + static inline opmath_t eval(char* src, char** data, const int64_t* strides, int64_t i) { index_t i0 = *(index_t*)&data[0][i * strides[0]]; index_t i1 = *(index_t*)&data[2][i * strides[2]]; - scalar_t w0 = *(scalar_t *)&data[1][i * strides[1]]; - scalar_t w1 = *(scalar_t *)&data[3][i * strides[3]]; + opmath_t w0 = *(scalar_t *)&data[1][i * strides[1]]; + opmath_t w1 = *(scalar_t *)&data[3][i * strides[3]]; 
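The template change above splits the storage type from the accumulation type: indices and weights are still read as scalar_t, but every product and running sum is carried in opmath_t (float when scalar_t is BFloat16 or Half), and the result is cast back once at the end. A stripped-down sketch of that pattern, using plain arrays instead of the byte-offset/stride indexing of the real Interpolate structs; weighted_sum is a made-up name.

```cpp
#include <cstdint>
#include <cstdio>

// Values and weights are stored as scalar_t, accumulation happens in opmath_t.
template <typename scalar_t, typename opmath_t>
opmath_t weighted_sum(const scalar_t* src, const int64_t* ids,
                      const scalar_t* wts, int interp_size) {
  opmath_t output =
      static_cast<opmath_t>(src[ids[0]]) * static_cast<opmath_t>(wts[0]);
  for (int j = 1; j < interp_size; ++j) {
    output += static_cast<opmath_t>(src[ids[j]]) * static_cast<opmath_t>(wts[j]);
  }
  return output;  // the caller casts back to scalar_t once, after the reduction
}

int main() {
  const float src[4] = {0.f, 1.f, 2.f, 3.f};
  const int64_t ids[2] = {1, 2};
  const float wts[2] = {0.25f, 0.75f};
  // With scalar_t = float the opmath type is also float; for BFloat16/Half the
  // same shape of code accumulates in float and avoids repeated rounding.
  std::printf("%f\n", weighted_sum<float, float>(src, ids, wts, 2));
  return 0;
}
```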
- scalar_t t0 = Interpolate::eval(src + i0, &data[4], &strides[4], i); - scalar_t t1 = Interpolate::eval(src + i1, &data[4], &strides[4], i); + opmath_t t0 = Interpolate::eval(src + i0, &data[4], &strides[4], i); + opmath_t t1 = Interpolate::eval(src + i1, &data[4], &strides[4], i); return t0 * w0 + t1 * w1; } }; -template -struct Interpolate<1, scalar_t, index_t, 2> { - static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { +template +struct Interpolate<1, scalar_t, opmath_t, index_t, 2> { + static inline opmath_t eval(char* src, char** data, const int64_t* strides, int64_t i) { index_t i0 = *(index_t*)&data[0][i * strides[0]]; index_t i1 = *(index_t*)&data[2][i * strides[2]]; - scalar_t w0 = *(scalar_t *)&data[1][i * strides[1]]; - scalar_t w1 = *(scalar_t *)&data[3][i * strides[3]]; - scalar_t t0 = *(scalar_t *)&src[i0]; - scalar_t t1 = *(scalar_t *)&src[i1]; + opmath_t w0 = *(scalar_t *)&data[1][i * strides[1]]; + opmath_t w1 = *(scalar_t *)&data[3][i * strides[3]]; + opmath_t t0 = *(scalar_t *)&src[i0]; + opmath_t t1 = *(scalar_t *)&src[i1]; return t0 * w0 + t1 * w1; } }; template static inline scalar_t interpolate(char* src, char** data, const int64_t* strides, int64_t i) { - return Interpolate::eval(src, data, strides, i); + using opmath_t = at::opmath_type; + return Interpolate::eval(src, data, strides, i); } template @@ -472,7 +473,7 @@ void cpu_upsample_nearest_channels_last( auto input = input_.contiguous(channels_last_memory_format); auto output = output_.contiguous(channels_last_memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t num_batches = input_sizes[0]; @@ -488,7 +489,7 @@ void cpu_upsample_nearest_channels_last( TORCH_CHECK(channels > 0, "expected input and output channels greater than 0 but got ", channels); using Vec = vec::Vectorized; - auto copy = [](scalar_t* out, scalar_t* in, int64_t size) { + auto copy = [](scalar_t* out, const scalar_t* in, int64_t size) { int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { Vec out_vec = Vec::loadu(in + d); @@ -509,7 +510,7 @@ void cpu_upsample_nearest_channels_last( int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[0]); int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[1]); scalar_t* output_ptr = output_data + i * channels; - scalar_t* input_ptr = input_data + n * input_height * input_width * channels + + const scalar_t* input_ptr = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; copy(output_ptr, input_ptr, channels); data_index_step(n, num_batches, oh, output_height, ow, output_width); @@ -528,7 +529,7 @@ void cpu_upsample_nearest_channels_last( int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[1]); int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[2]); scalar_t* output_ptr = output_data + i * channels; - scalar_t* input_ptr = input_data + n * input_depth * input_height * input_width * channels + + const scalar_t* input_ptr = input_data + n * input_depth * input_height * input_width * channels + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; copy(output_ptr, input_ptr, channels); @@ -578,7 +579,7 @@ void cpu_upsample_linear_channels_last( auto input = input_.contiguous(channels_last_memory_format); auto output = output_.contiguous(channels_last_memory_format); - auto input_data = input.data_ptr(); + auto input_data = 
input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t num_batches = input_sizes[0]; @@ -619,10 +620,10 @@ void cpu_upsample_linear_channels_last( scalar_t* out = output_data + n * output_slice_size + oh * output_width * channels + ow * channels; - scalar_t* i00 = input_indexr(n, ih0, iw0); - scalar_t* i01 = input_indexr(n, ih0, iw1); - scalar_t* i10 = input_indexr(n, ih1, iw0); - scalar_t* i11 = input_indexr(n, ih1, iw1); + const scalar_t* i00 = input_indexr(n, ih0, iw0); + const scalar_t* i01 = input_indexr(n, ih0, iw1); + const scalar_t* i10 = input_indexr(n, ih1, iw0); + const scalar_t* i11 = input_indexr(n, ih1, iw1); opmath_t w00 = h0lambda * w0lambda; opmath_t w01 = h0lambda * w1lambda; opmath_t w10 = h1lambda * w0lambda; @@ -673,14 +674,14 @@ void cpu_upsample_linear_channels_last( scalar_t* out = output_data + n * output_slice_size + od * output_height * output_width * channels + oh * output_width * channels + ow * channels; - scalar_t* i000 = input_indexr(n, id0, ih0, iw0); - scalar_t* i001 = input_indexr(n, id0, ih0, iw1); - scalar_t* i010 = input_indexr(n, id0, ih1, iw0); - scalar_t* i011 = input_indexr(n, id0, ih1, iw1); - scalar_t* i100 = input_indexr(n, id1, ih0, iw0); - scalar_t* i101 = input_indexr(n, id1, ih0, iw1); - scalar_t* i110 = input_indexr(n, id1, ih1, iw0); - scalar_t* i111 = input_indexr(n, id1, ih1, iw1); + const scalar_t* i000 = input_indexr(n, id0, ih0, iw0); + const scalar_t* i001 = input_indexr(n, id0, ih0, iw1); + const scalar_t* i010 = input_indexr(n, id0, ih1, iw0); + const scalar_t* i011 = input_indexr(n, id0, ih1, iw1); + const scalar_t* i100 = input_indexr(n, id1, ih0, iw0); + const scalar_t* i101 = input_indexr(n, id1, ih0, iw1); + const scalar_t* i110 = input_indexr(n, id1, ih1, iw0); + const scalar_t* i111 = input_indexr(n, id1, ih1, iw1); opmath_t w000 = d0lambda * h0lambda * w0lambda; opmath_t w001 = d0lambda * h0lambda * w1lambda; opmath_t w010 = d0lambda * h1lambda * w0lambda; @@ -741,30 +742,30 @@ struct HelperInterpBase { } } + // This is a helper function for _compute_index_ranges_weights method that computes + // source two int64 scalars index min and size and a list weights (of size max_interp_size) + // for interpolation with antialiasing=true mode. It returns the maximal weights value template - static inline scalar_t _compute_weights_aa( + static inline scalar_t _compute_indices_min_size_weights_aa( const int64_t i, const int64_t input_size, const scalar_t scale, const scalar_t support, scalar_t* wt_ptr, const int64_t max_interp_size, aa_filter_fn_t filter_fn, - int64_t& xmin, int64_t& xsize, bool antialias, double align_corners_delta + int64_t& xmin, int64_t& xsize ) { - // align_corners_delta is 0.5 for uint8 and align_corners=true and antialias=false - // is 0.0 otherwise - scalar_t center = scale * (i + 0.5 - align_corners_delta); + scalar_t center = scale * (i + 0.5); scalar_t total_w = 0.0; - scalar_t invscale = (scale >= 1.0 && antialias) ? 1.0 / scale : 1.0; + scalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0; xmin = std::max( - static_cast(center - support + 0.5 + align_corners_delta), static_cast(0)); + static_cast(center - support + 0.5), static_cast(0)); xsize = std::min( - static_cast(center + support + 0.5 + align_corners_delta), input_size) - xmin; - + static_cast(center + support + 0.5), input_size) - xmin; // There are rare cases when due to precision xsize can be larger than max_interp_size by one. 
// We have to clip the value xsize = std::clamp(xsize, static_cast(0), max_interp_size); int64_t j = 0; for (; j < xsize; j++) { - scalar_t w = filter_fn((j + xmin - center + 0.5 - align_corners_delta) * invscale); + scalar_t w = filter_fn((j + xmin - center + 0.5) * invscale); wt_ptr[j] = w; total_w += w; } @@ -783,10 +784,72 @@ struct HelperInterpBase { return wt_max; } - // Note [ Support for antialias=False as a subcase of antilias=True ] + // This is a helper function for _compute_index_ranges_weights method that computes + // source two int64 scalars index min and size and a list weights (of size max_interp_size) + // for interpolation with antialiasing=false mode. It returns the maximal weights value. + // This function is templated with scalar_t for type of scale and weights but is only used for + // bilinear/bicubic modes on uint8 input and antialiasing=false (in this case scalar_t is double). + // For float input types we are using upsample_generic_Nd_kernel_impl and compute_indices_weights methods + template + static inline scalar_t _compute_indices_min_size_weights( + const int64_t i, const int64_t input_size, const scalar_t scale, + scalar_t* wt_ptr, const int64_t max_interp_size, aa_filter_fn_t filter_fn, + bool align_corners, int64_t& index_min, int64_t& index_size + ) { + // Notes. We do not use opmath_t in this method as f16 and other smaller float types are not routed here. + // Typical usage of this method is with scalar_t = double when computing indices and weights for uint8 input + // The code below partly adapts indices and lambda computation from compute_indices_weights method and + // index_min/index_size from _compute_indices_min_size_weights_aa + + bool cubic = max_interp_size > 2; + const auto real_input_index = area_pixel_compute_source_index( + scale, i, align_corners, /*cubic=*/cubic); + + scalar_t lambda; + int64_t input_index; + guard_index_and_lambda(real_input_index, input_size, input_index, lambda); + + const auto support = static_cast(max_interp_size * 0.5); + const auto unbound_index_min = input_index - support + 1; + const auto unbound_index_max = input_index + support + 1; + index_min = std::max(unbound_index_min, static_cast(0)); + index_size = std::min(unbound_index_max, input_size) - index_min; + // There are rare cases when due to precision xsize can be larger than max_interp_size by one. + // We have to clip the value + index_size = std::clamp(index_size, static_cast(0), max_interp_size); + + // Below the weights are computed using filter_fn and accumulating values for indices being out of bounds + // For example, for bicubic mode for output index i = 0, we have input_index = -1, + // then we have unbound_index_min = -2 and unbound_index_max = 1 => unbounded input indices are [-2, -1, 0, 1] and + // valid input indices will be [0, 1] + // For unbounded input indices we compute four non-zero weights values [w0, w1, w2, w3] and as only two weights can + // be used with valid input indcies, we accumulate values in the following way: [w0 + w1 + w2, w3, 0.0, 0.0] + // This is equivalent to the float path which would compute indices as [0, 0, 0, 1] and weights as [w0, w1, w2, s3]. + // A similar accumulation should done for unbounded indices larger than input size. 
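A standalone restatement of the boundary handling the comment above describes, with the same index_min/index_size computation as the new helper but a toy filter in place of the bilinear/bicubic ones (fold_weights and toy_filter are made-up names). Running it reproduces the [w0 + w1 + w2, w3, 0, 0] folding from the bicubic example in the comment.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Taps falling outside [0, input_size) keep their weights, but those weights
// are folded onto the nearest valid slot, so only index_size weights are used.
std::vector<double> fold_weights(int64_t input_index, double lambda,
                                 int64_t input_size, int64_t max_interp_size,
                                 double (*filter_fn)(double),
                                 int64_t& index_min, int64_t& index_size) {
  const auto support = static_cast<int64_t>(max_interp_size * 0.5);
  const int64_t unbound_index_min = input_index - support + 1;
  const int64_t unbound_index_max = input_index + support + 1;
  index_min = std::max<int64_t>(unbound_index_min, 0);
  index_size = std::min(unbound_index_max, input_size) - index_min;
  index_size = std::clamp<int64_t>(index_size, 0, max_interp_size);

  std::vector<double> wt(max_interp_size, 0.0);
  int64_t w_index = 0;
  for (int64_t j = 0; j < max_interp_size; ++j) {
    double w = filter_fn(static_cast<double>(j + 1 - support) - lambda);
    if (unbound_index_min + j <= 0) {
      w_index = 0;                 // fold left out-of-range taps onto slot 0
    } else if (unbound_index_min + j >= input_size - 1) {
      w_index = index_size - 1;    // fold right out-of-range taps onto the last slot
    }
    wt[w_index] += w;
    ++w_index;
  }
  return wt;
}

int main() {
  // Bicubic-like setup from the comment: output index 0 maps to input_index = -1,
  // so the unbounded taps are [-2, -1, 0, 1] but only [0, 1] are valid.
  auto toy_filter = [](double x) { return std::max(0.0, 1.5 - std::abs(x) * 0.5); };
  int64_t index_min = 0, index_size = 0;
  auto wt = fold_weights(/*input_index=*/-1, /*lambda=*/0.0, /*input_size=*/8,
                         /*max_interp_size=*/4, +toy_filter, index_min, index_size);
  std::printf("index_min=%lld index_size=%lld weights=[%g, %g, %g, %g]\n",
              (long long)index_min, (long long)index_size, wt[0], wt[1], wt[2], wt[3]);
  return 0;
}
```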
+ auto w_index = 0; + scalar_t wt_max = 0.0; + for (const auto j : c10::irange(max_interp_size)) { + // initialize weights value as we will accumulate below + wt_ptr[j] = 0.0; + + scalar_t w = filter_fn(static_cast(j + 1 - support) - lambda); + if (unbound_index_min + j <= 0) { + w_index = 0; + } else if (unbound_index_min + j >= input_size - 1) { + w_index = index_size - 1; + } + wt_ptr[w_index] += w; + wt_max = std::max(wt_max, wt_ptr[w_index]); + w_index++; + } + + return wt_max; + } + + // Note [ Support for antialias=False as a subcase of antialias=True ] // This function was originally written with the hard assumption that - // antialias=True (hence the aa in the name). It was later extended to support - // antialias=False. The only difference between aa and no-aa is in how the + // antialias=True and it was later extended to support antialias=False. + // The only difference between aa and no-aa is in how the // weights and indices are computed (and their number). In aa their number is // variable but with no-aa, they're fixed to interp_size. The same "filters" // can be used otherwise. HOWEVER, support for antialias=False here may not be @@ -794,10 +857,10 @@ struct HelperInterpBase { // indices, but this can be optimized further when aa=False since we know // their actual dimensions. template - static inline std::tuple, int, scalar_t> _compute_indices_weights_aa( + static inline std::tuple, int, scalar_t> _compute_index_ranges_weights( int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim, scalar_t scale, - int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, double align_corners_delta + int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, bool align_corners ) { std::vector output; @@ -845,24 +908,35 @@ struct HelperInterpBase { scalar_t wt_max = 0.0; for (const auto i : c10::irange(output_size)) { - int64_t xmin, xmax; - auto wt_max_i = HelperInterpBase::_compute_weights_aa( - i, - input_size, - scale, - support, - wt_ptr + i * max_interp_size, - max_interp_size, - aa_filter_fn, - xmin, - xmax, - antialias, - align_corners_delta); - + int64_t xmin, xsize; + scalar_t wt_max_i; + if (antialias) { + wt_max_i = HelperInterpBase::_compute_indices_min_size_weights_aa( + i, + input_size, + scale, + support, + wt_ptr + i * max_interp_size, + max_interp_size, + aa_filter_fn, + xmin, + xsize); + } else { + wt_max_i = HelperInterpBase::_compute_indices_min_size_weights( + i, + input_size, + scale, + wt_ptr + i * max_interp_size, + max_interp_size, + aa_filter_fn, + align_corners, + xmin, + xsize); + } wt_max = std::max(wt_max, wt_max_i); idx_ptr_xmin[i] = xmin * stride; - idx_ptr_size[i] = xmax; + idx_ptr_size[i] = xsize; idx_ptr_stride[i] = stride; wt_idx_ptr[i] = i * max_interp_size * weight_index_stride; } @@ -878,7 +952,7 @@ struct HelperInterpBase { uint8 in basic_loop_aa_horizontal (and vertical) In essence the idea is to avoid a multiplication between a float (the - weight) and an int (the pixel value) and instead run a multpilication between + weight) and an int (the pixel value) and instead run a multiplication between 2 ints: ```py @@ -911,7 +985,7 @@ struct HelperInterpBase { = what we wanted */ template - static inline std::tuple, int, unsigned int> _compute_indices_int16_weights_aa( + static inline std::tuple, int, unsigned int> _compute_index_ranges_int16_weights( int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim, bool align_corners, const c10::optional opt_scale, int interp_size, 
aa_filter_fn_t aa_filter_fn, bool antialias, bool align_i32=false @@ -921,10 +995,9 @@ struct HelperInterpBase { input_size, output_size, align_corners, opt_scale); std::vector indices_weights; - auto align_corners_delta = (align_corners && !antialias) ? 0.5 : 0.0; double wt_max; - std::tie(indices_weights, interp_size, wt_max) = HelperInterpBase::_compute_indices_weights_aa( - input_size, output_size, stride, ndims, reshape_dim, scale, interp_size, aa_filter_fn, antialias, align_corners_delta); + std::tie(indices_weights, interp_size, wt_max) = HelperInterpBase::_compute_index_ranges_weights( + input_size, output_size, stride, ndims, reshape_dim, scale, interp_size, aa_filter_fn, antialias, align_corners); // Rescale float weights to int16 and compute weights precision auto weights_f64 = indices_weights[3]; @@ -1008,8 +1081,8 @@ struct HelperInterpNearest : public HelperInterpBase { HelperInterpNearest::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpNearest::interp_size); - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, scalar_type, "compute_indices_weights_nearest", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, scalar_type, "compute_indices_weights_nearest", [&] { using opmath_t = at::opmath_type; opmath_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); @@ -1059,9 +1132,10 @@ struct HelperInterpNearestExact : public HelperInterpNearest { HelperInterpNearest::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpNearest::interp_size); - AT_DISPATCH_FLOATING_TYPES( - scalar_type, "compute_indices_weights_nearest", [&] { - scalar_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, scalar_type, "compute_indices_weights_nearest", [&] { + using opmath_t = at::opmath_type; + opmath_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); auto input_index_ptr = output[0].data_ptr(); int64_t input_index; @@ -1071,7 +1145,6 @@ struct HelperInterpNearestExact : public HelperInterpNearest { // index_f32 = (output_index + 0.5) * scale - 0.5 // input_index = round(index_f32) // Same as Pillow and Scikit-Image/Scipy ndi.zoom - using opmath_t = at::opmath_type; for (const auto i : c10::irange(output_size)) { const auto real_input_index = area_pixel_compute_source_index( @@ -1108,8 +1181,8 @@ struct HelperInterpLinear : public HelperInterpBase { std::vector output; HelperInterpLinear::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpLinear::interp_size); - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, scalar_type, "compute_indices_weights_linear", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, scalar_type, "compute_indices_weights_linear", [&] { using opmath_t = at::opmath_type; opmath_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); @@ -1149,7 +1222,7 @@ struct HelperInterpLinear : public HelperInterpBase { return 0.0; } - static inline std::vector compute_indices_weights_aa( + static inline std::vector compute_index_ranges_weights( at::ScalarType scalar_type, int64_t input_size, int64_t output_size, @@ -1163,17 +1236,14 @@ struct HelperInterpLinear : public HelperInterpBase { std::vector indices_weights; AT_DISPATCH_FLOATING_TYPES( - scalar_type, "compute_indices_weights_aa", [&] { + scalar_type, "compute_index_ranges_weights", [&] { scalar_t scale = 
area_pixel_compute_scale( input_size, output_size, align_corners, opt_scale); auto interp_size = HelperInterpLinear::interp_size; - int unused; - scalar_t unused_2; - auto align_corners_delta = (align_corners && !antialias) ? 0.5 : 0.0; - std::tie(indices_weights, unused, unused_2) = HelperInterpLinear::_compute_indices_weights_aa( + indices_weights = std::get<0>(HelperInterpLinear::_compute_index_ranges_weights( input_size, output_size, stride, @@ -1183,13 +1253,13 @@ struct HelperInterpLinear : public HelperInterpBase { interp_size, &HelperInterpLinear::aa_filter, /*antialias=*/antialias, - /*align_corners_delta=*/align_corners_delta); + /*align_corners=*/align_corners)); } ); return indices_weights; } - static inline std::tuple, int, unsigned int> compute_indices_int16_weights_aa( + static inline std::tuple, int, unsigned int> compute_index_ranges_int16_weights( int64_t input_size, int64_t output_size, int64_t stride, @@ -1203,7 +1273,7 @@ struct HelperInterpLinear : public HelperInterpBase { auto interp_size = HelperInterpLinear::interp_size; auto fn = HelperInterpLinear::aa_filter; - return HelperInterpLinear::_compute_indices_int16_weights_aa( + return HelperInterpLinear::_compute_index_ranges_int16_weights( input_size, output_size, stride, ndims, reshape_dim, align_corners, opt_scale, interp_size, fn, antialias, align_i32); } @@ -1233,8 +1303,8 @@ struct HelperInterpCubic : public HelperInterpBase { HelperInterpCubic::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpCubic::interp_size); - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, scalar_type, "compute_indices_weights_cubic", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, scalar_type, "compute_indices_weights_cubic", [&] { using opmath_t = at::opmath_type; opmath_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); @@ -1286,7 +1356,7 @@ struct HelperInterpCubic : public HelperInterpBase { return 0.0; } - static inline std::vector compute_indices_weights_aa( + static inline std::vector compute_index_ranges_weights( at::ScalarType scalar_type, int64_t input_size, int64_t output_size, @@ -1300,17 +1370,14 @@ struct HelperInterpCubic : public HelperInterpBase { std::vector indices_weights; AT_DISPATCH_FLOATING_TYPES( - scalar_type, "compute_indices_weights_aa", [&] { + scalar_type, "compute_index_ranges_weights", [&] { scalar_t scale = area_pixel_compute_scale( input_size, output_size, align_corners, opt_scale); auto interp_size = HelperInterpCubic::interp_size; - int unused; - scalar_t unused_2; - auto align_corners_delta = (align_corners && !antialias) ? 
0.5 : 0.0; - std::tie(indices_weights, unused, unused_2) = HelperInterpCubic::_compute_indices_weights_aa( + indices_weights = std::get<0>(HelperInterpCubic::_compute_index_ranges_weights( input_size, output_size, stride, @@ -1320,13 +1387,13 @@ struct HelperInterpCubic : public HelperInterpBase { interp_size, &HelperInterpCubic::aa_filter, /*antialias=*/antialias, - /*align_corners_delta*/align_corners_delta); + /*align_corners=*/align_corners)); } ); return indices_weights; } - static inline std::tuple, int, unsigned int> compute_indices_int16_weights_aa( + static inline std::tuple, int, unsigned int> compute_index_ranges_int16_weights( int64_t input_size, int64_t output_size, int64_t stride, @@ -1342,7 +1409,7 @@ struct HelperInterpCubic : public HelperInterpBase { // We have to use the -0.75 constant when aa is False so that this uint8 // path is as close as possible to float results. auto fn = antialias ? HelperInterpCubic::aa_filter : HelperInterpCubic::aa_filter; - return HelperInterpCubic::_compute_indices_int16_weights_aa( + return HelperInterpCubic::_compute_index_ranges_int16_weights( input_size, output_size, stride, ndims, reshape_dim, align_corners, opt_scale, interp_size, fn, antialias, align_i32); } @@ -1407,11 +1474,11 @@ void upsample_generic_Nd_kernel_impl( config.check_all_same_dtype(false) .declare_static_dtype_and_device(input.scalar_type(), input.device()) .add_output(output) - .add_input(restrided_input); + .add_const_input(restrided_input); for (auto & idx_weight: indices_weights) { for (auto& tensor : idx_weight) { - config.add_input(tensor); + config.add_const_input(tensor); } } @@ -1419,14 +1486,14 @@ void upsample_generic_Nd_kernel_impl( if (interp_size > 1) { // Nearest also supports uint8 tensor, so need to handle it separately - AT_DISPATCH_FLOATING_TYPES_AND( - at::ScalarType::BFloat16, iter.dtype(), "upsample_generic_Nd", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, iter.dtype(), "upsample_generic_Nd", [&] { // MSVC can not catch constexpr int interp_size here constexpr int mode = F::interp_size; cpu_upsample_generic(iter); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, + AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, iter.dtype(), "upsample_generic_Nd", [&] { constexpr int mode = F::interp_size; cpu_upsample_generic(iter); @@ -1504,7 +1571,7 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim( // This is a special branch to provide uint8 dtype support for bilinear and bicubic modes only TORCH_INTERNAL_ASSERT(F::interp_size == 2 || F::interp_size == 4); std::tie(indices_weights, unused, weights_precision) = - F::compute_indices_int16_weights_aa( + F::compute_index_ranges_int16_weights( input.size(interp_dim), oshape[interp_dim], input.stride(interp_dim) * input.element_size(), input.dim(), interp_dim, align_corners, scales[interp_dim - 2], @@ -1512,7 +1579,7 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim( TORCH_INTERNAL_ASSERT(weights_precision > 0); } else { indices_weights = - F::compute_indices_weights_aa( + F::compute_index_ranges_weights( input_scalar_type, input.size(interp_dim), oshape[interp_dim], input.stride(interp_dim) * input.element_size(), input.dim(), interp_dim, align_corners, scales[interp_dim - 2], @@ -1523,10 +1590,10 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim( config.check_all_same_dtype(false) .declare_static_dtype_and_device(input.scalar_type(), input.device()) .add_output(output) - .add_input(restrided_input); + 
.add_const_input(restrided_input); for (auto& tensor : indices_weights) { - config.add_input(tensor); + config.add_const_input(tensor); } auto iter = config.build(); @@ -1662,7 +1729,7 @@ void upsample_nearest2d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (_use_vectorized_kernel_cond_2d(output, input)) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, + AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest2d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_h, scales_w}); }); @@ -1678,7 +1745,7 @@ void _upsample_nearest_exact2d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (_use_vectorized_kernel_cond_2d(output, input)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest2d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_h, scales_w}); }); } else { @@ -1694,7 +1761,7 @@ void upsample_nearest3d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (_use_vectorized_kernel_cond_3d(output, input)) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, + AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest3d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_d, scales_h, scales_w}); }); @@ -1711,7 +1778,7 @@ void _upsample_nearest_exact3d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (_use_vectorized_kernel_cond_3d(output, input)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, input.scalar_type(), "upsample_nearest3d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest3d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_d, scales_h, scales_w}); }); } else { @@ -1743,7 +1810,7 @@ void upsample_bilinear2d_kernel_impl_float( // That's not the case for masks though (C == 1), which strongly benefit from // using the generic kernel. 
if ((_use_vectorized_kernel_cond_2d(output, input)) || (at::get_num_threads() == 1 && input.size(1) == 3)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, input.scalar_type(), "upsample_bilinear2d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "upsample_bilinear2d_channels_last", [&] { cpu_upsample_linear_channels_last(output, input, align_corners, {scales_h, scales_w}); }); } else { @@ -1812,7 +1879,7 @@ void upsample_trilinear3d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if ((_use_vectorized_kernel_cond_3d(output, input))) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, input.scalar_type(), "upsample_trilinear3d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "upsample_trilinear3d_channels_last", [&] { cpu_upsample_linear_channels_last(output, input, align_corners, {scales_d, scales_h, scales_w}); }); } else { @@ -1890,7 +1957,7 @@ void cpu_upsample_genNd_backward_aa( auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); auto input_sizes = grad_input.sizes().vec(); auto output_sizes = grad_output.sizes().vec(); @@ -1939,7 +2006,7 @@ void cpu_upsample_genNd_backward_aa( aa_filter_fn_t filter_fn = &F::aa_filter; for (const auto oh : c10::irange(output_height)) { - F::_compute_weights_aa( + F::_compute_indices_min_size_weights_aa( oh, input_height, height_scale, @@ -1948,12 +2015,10 @@ void cpu_upsample_genNd_backward_aa( interp_height, filter_fn, ymin, - ysize, - /*antialias=*/true, - /*align_corners_delta=*/0.0); + ysize); for (const auto ow : c10::irange(output_width)) { - F::_compute_weights_aa( + F::_compute_indices_min_size_weights_aa( ow, input_width, width_scale, @@ -1962,9 +2027,7 @@ void cpu_upsample_genNd_backward_aa( interp_width, filter_fn, xmin, - xsize, - /*antialias=*/true, - /*align_corners_delta=*/0.0); + xsize); for (const auto c : c10::irange(begin, end)) { scalar_t grad_output_value = diff --git a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h index 2a996cfa4f1c9..726a83c20963d 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h +++ b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h @@ -66,7 +66,7 @@ at::Tensor unpack_rgb(const at::Tensor& packed_tensor) { // into as 32 bits. This generalizes to num_channels <= 4 and also works for // non-channels_last tensors. 
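Before the resampling loops below consume the int16 weights, here is a small self-contained sketch of the fixed-point idea behind compute_index_ranges_int16_weights (the "multiplication between 2 ints" note referenced above): weights are pre-scaled by 2^precision into int16, pixels are accumulated in int32, and a rounding shift recovers the uint8 result. The precision value and clamping here are illustrative only; the kernel derives its own weights_precision.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<double> weights = {0.1, 0.4, 0.4, 0.1};  // already normalized
  const std::vector<uint8_t> pixels = {10, 200, 220, 30};

  const unsigned precision = 12;  // weights stored as fixed point with 12 fractional bits
  std::vector<int16_t> w16(weights.size());
  for (size_t i = 0; i < weights.size(); ++i) {
    w16[i] = static_cast<int16_t>(std::lround(weights[i] * (1 << precision)));
  }

  // Accumulate in int32: an int16 weight times a uint8 pixel cannot overflow here.
  int32_t acc = 1 << (precision - 1);  // +0.5 in fixed point, for rounding
  for (size_t i = 0; i < pixels.size(); ++i) {
    acc += static_cast<int32_t>(w16[i]) * pixels[i];
  }
  const uint8_t out = static_cast<uint8_t>(std::clamp(acc >> precision, 0, 255));

  // Reference result computed in floating point, for comparison.
  double ref = 0.0;
  for (size_t i = 0; i < pixels.size(); ++i) ref += weights[i] * pixels[i];
  std::printf("fixed-point: %d   float: %.3f\n", static_cast<int>(out), ref);
  return 0;
}
```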
- const uint8_t* packed = (const uint8_t*)packed_tensor.data_ptr(); + const uint8_t* packed = (const uint8_t*)packed_tensor.const_data_ptr(); auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2); auto num_channels = packed_tensor.size(0); @@ -180,18 +180,18 @@ void ImagingResampleHorizontal( // Although this may not be needed if / when we port all this code to use // Vec.h since this would potentially give us another fall-back implem - const int16_t* kk = (int16_t*)(horiz_indices_weights[3].data_ptr()); + const int16_t* kk = (int16_t*)(horiz_indices_weights[3].const_data_ptr()); auto xout = unpacked_output.size(2); auto yout = unpacked_output.size(1); auto xin = unpacked_input.size(2); TORCH_INTERNAL_ASSERT(num_channels == unpacked_input.size(0)); - const int64_t* idx_ptr_xmin = horiz_indices_weights[0].data_ptr(); - const int64_t* idx_ptr_size = horiz_indices_weights[1].data_ptr(); + const int64_t* idx_ptr_xmin = horiz_indices_weights[0].const_data_ptr(); + const int64_t* idx_ptr_size = horiz_indices_weights[1].const_data_ptr(); uint8_t* unpacked_output_p = unpacked_output.data_ptr(); - const uint8_t* unpacked_input_p = unpacked_input.data_ptr(); + const uint8_t* unpacked_input_p = unpacked_input.const_data_ptr(); int64_t yy = 0; auto xout_stride = xout * num_channels; @@ -255,13 +255,13 @@ void ImagingResampleVertical( // basic_loop_aa_vertical) // Although this may not be needed if / when we port all this code to use // Vec.h since this would potentially give us another fall-back implem - const int16_t* kk = (int16_t*)(vert_indices_weights[3].data_ptr()); + const int16_t* kk = (int16_t*)(vert_indices_weights[3].const_data_ptr()); - const int64_t* idx_ptr_xmin = vert_indices_weights[0].data_ptr(); - const int64_t* idx_ptr_size = vert_indices_weights[1].data_ptr(); + const int64_t* idx_ptr_xmin = vert_indices_weights[0].const_data_ptr(); + const int64_t* idx_ptr_size = vert_indices_weights[1].const_data_ptr(); uint8_t* unpacked_output_p = unpacked_output.data_ptr(); - const uint8_t* unpacked_input_p = unpacked_input.data_ptr(); + const uint8_t* unpacked_input_p = unpacked_input.const_data_ptr(); auto xout = unpacked_output.size(2); auto yout = unpacked_output.size(1); @@ -296,7 +296,7 @@ void ImagingResampleVertical( // [ Weights computation for uint8_t and multiplication trick ] // For details on how the AVX kernels are implemented, see // https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5 -// See also [ Support for antialias=False as a subcase of antilias=True ] to +// See also [ Support for antialias=False as a subcase of antialias=True ] to // learn more about how the antialias=False case is computed. The same holds // here: all these kernels are general enough to handle an arbitrary number of // weights, but when aa=False they could be optimized further. @@ -344,7 +344,7 @@ void upsample_avx_bilinear_bicubic_uint8( int interp_dim = 3; auto stride = (skip_unpacking) ? num_channels : 4; std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) = - F::compute_indices_int16_weights_aa( + F::compute_index_ranges_int16_weights( /*input_size=*/xin, /*output_size=*/xout, /*stride=*/stride, @@ -360,7 +360,7 @@ void upsample_avx_bilinear_bicubic_uint8( int interp_dim = 2; auto stride = (skip_unpacking) ? 
num_channels * xout : 4 * xout; std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) = - F::compute_indices_int16_weights_aa( + F::compute_index_ranges_int16_weights( /*input_size=*/yin, /*output_size=*/yout, /*stride=*/stride, @@ -699,7 +699,7 @@ void ImagingResampleHorizontalConvolution8u4x( // Memcpy 4-bytes is faster than 3-bytes and this is a boundary case when we want to write // 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1). // The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct - // value which was preveiously computed by another line. In other words, it means that we can not overwrite + // value which was previously computed by another line. In other words, it means that we can not overwrite // it by simply writing 4 bytes from the register to the output. We'll do the following: // v----------| // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...] @@ -1040,7 +1040,7 @@ void ImagingResampleHorizontalConvolution8u( // Memcpy 4-bytes is faster than 3-bytes and this is a boundary case when we want to write // 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1). // The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct - // value which was preveiously computed by another line. In other words, it means that we can not overwrite + // value which was previously computed by another line. In other words, it means that we can not overwrite // it by simply writing 4 bytes from the register to the output. We'll do the following: // v----------| // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...] diff --git a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp index 0e2511394ec75..b97b5cefee2c8 100644 --- a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp @@ -14,11 +14,13 @@ namespace { using scale_t = std::vector>; -template +template , + typename std::enable_if_t || !std::is_same::value, int> = 0> void inline nearest_channels_last_acc(acc_t* gin, scalar_t* gout, int64_t size) { - TORCH_CHECK((std::is_same::value), + TORCH_CHECK((std::is_same::value), "acc data type of Upsample backward should be same as scalar_t for float or double on CPU.") - using Vec = vec::Vectorized; + using Vec = Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { Vec gin_vec = Vec::loadu(gin + d) + Vec::loadu(gout + d); @@ -29,14 +31,16 @@ void inline nearest_channels_last_acc(acc_t* gin, scalar_t* gout, int64_t size) } } -template <> -void inline nearest_channels_last_acc(float* gin, BFloat16* gout, int64_t size) { - using bVec = vec::Vectorized; - using fVec = vec::Vectorized; +template , + typename std::enable_if_t && std::is_same::value, int> = 0> +void inline nearest_channels_last_acc(acc_t* gin, scalar_t* gout, int64_t size) { + using bVec = Vectorized; + using fVec = Vectorized; int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec gout_bvec = bVec::loadu(gout + d); - auto [gout_fvec0, gout_fvec1] = convert_bfloat16_float(gout_bvec); + auto [gout_fvec0, gout_fvec1] = convert_to_float(gout_bvec); fVec gin_fvec0 = fVec::loadu(gin + d) + gout_fvec0; fVec gin_fvec1 = fVec::loadu(gin + d + fVec::size()) + gout_fvec1; gin_fvec0.store(gin + d); @@ -47,11 +51,13 @@ void inline nearest_channels_last_acc(float* gin, BFloat16* gout, int64_t size) } } -template -void inline linear_channels_last_acc(acc_t* gin, scalar_t* gout, acc_t w, int64_t size) 
{ - TORCH_CHECK((std::is_same::value), +template , + typename std::enable_if_t || !std::is_same::value, int> = 0> +void inline linear_channels_last_acc(acc_t* gin, const scalar_t* gout, acc_t w, int64_t size) { + TORCH_CHECK((std::is_same::value), "acc data type of Upsample backward should be same as scalar_t for float or double on CPU.") - using Vec = vec::Vectorized; + using Vec = Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { Vec gin_vec = Vec::loadu(gin + d) + Vec(w) * Vec::loadu(gout + d); @@ -62,14 +68,16 @@ void inline linear_channels_last_acc(acc_t* gin, scalar_t* gout, acc_t w, int64_ } } -template <> -void inline linear_channels_last_acc(float* gin, BFloat16* gout, float w, int64_t size) { - using bVec = vec::Vectorized; - using fVec = vec::Vectorized; +template , + typename std::enable_if_t && std::is_same::value, int> = 0> +void inline linear_channels_last_acc(acc_t* gin, const scalar_t* gout, acc_t w, int64_t size) { + using bVec = Vectorized; + using fVec = Vectorized; int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec gout_bvec = bVec::loadu(gout + d); - auto [gout_fvec0, gout_fvec1] = convert_bfloat16_float(gout_bvec); + auto [gout_fvec0, gout_fvec1] = convert_to_float(gout_bvec); fVec gin_fvec0 = fVec::loadu(gin + d) + fVec(w) * gout_fvec0; fVec gin_fvec1 = fVec::loadu(gin + d + fVec::size()) + fVec(w) * gout_fvec1; gin_fvec0.store(gin + d); @@ -91,7 +99,7 @@ void cpu_upsample_nearest_backward( auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); auto input_sizes = grad_input.sizes().vec(); auto output_sizes = grad_output.sizes().vec(); @@ -228,7 +236,7 @@ void cpu_upsample_nearest_backward_channels_last( auto grad_output = grad_output_.contiguous(channels_last_memory_format); auto grad_input = grad_input_.contiguous(channels_last_memory_format); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); auto input_sizes = grad_input.sizes().vec(); @@ -262,7 +270,7 @@ void cpu_upsample_nearest_backward_channels_last( int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[0]); for (const auto ow : c10::irange(output_width)) { int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[1]); - scalar_t* grad_output_ptr = grad_output_data + + const scalar_t* grad_output_ptr = grad_output_data + (n * output_height * output_width + oh * output_width + ow) * channels; opmath_t* buffer_ptr = acc_data_ptr + input_offset + (ih * input_width + iw) * channels; nearest_channels_last_acc(buffer_ptr, grad_output_ptr, channels); @@ -295,7 +303,7 @@ void cpu_upsample_nearest_backward_channels_last( int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[1]); for (int64_t ow = 0; ow < output_width; ow++) { int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[2]); - scalar_t* grad_output_ptr = grad_output_data + + const scalar_t* grad_output_ptr = grad_output_data + (n * output_depth * output_height * output_width + od * output_height * output_width + oh * output_width + ow) * channels; @@ -330,7 +338,7 @@ void upsample_nearest1d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, 
grad_output.scalar_type(), "upsample_nearest1d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_nearest1d_backward", [&] { cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); }); } @@ -339,7 +347,7 @@ void _upsample_nearest_exact1d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact1d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact1d_backward", [&] { cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); }); } @@ -350,11 +358,11 @@ void upsample_nearest2d_backward_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest2d_backward_cl", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_nearest2d_backward_cl", [&] { cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_h, scales_w}); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest2d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_nearest2d_backward", [&] { cpu_upsample_nearest_backward(grad_input, grad_output, {scales_h, scales_w}); }); } @@ -366,11 +374,11 @@ void _upsample_nearest_exact2d_backward_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward_cl", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward_cl", [&] { cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_h, scales_w}); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward", [&] { cpu_upsample_nearest_backward(grad_input, grad_output, {scales_h, scales_w}); }); } @@ -382,9 +390,15 @@ void upsample_nearest3d_backward_kernel_impl( c10::optional scales_d, c10::optional scales_h, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest3d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); - }); + if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest3d_backward_cl", [&] { + cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_d, scales_h, scales_w}); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_nearest3d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); + }); + } } void _upsample_nearest_exact3d_backward_kernel_impl( @@ -393,9 +407,15 @@ void _upsample_nearest_exact3d_backward_kernel_impl( c10::optional scales_d, c10::optional scales_h, 
c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact3d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); - }); + if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact3d_backward_cl", [&] { + cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_d, scales_h, scales_w}); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact3d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); + }); + } } template @@ -410,7 +430,7 @@ void cpu_upsample_linear_backward( auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); auto input_sizes = grad_input.sizes().vec(); auto output_sizes = grad_output.sizes().vec(); @@ -587,7 +607,7 @@ void cpu_upsample_linear_backward_channels_last( auto grad_output = grad_output_.contiguous(channels_last_memory_format); auto grad_input = grad_input_.contiguous(channels_last_memory_format); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); auto input_sizes = grad_input.sizes().vec(); @@ -635,7 +655,7 @@ void cpu_upsample_linear_backward_channels_last( for (const auto ow : c10::irange(output_width)) { compute_source_index_and_lambda( iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); - scalar_t* grad_output_ptr = grad_output_data + + const scalar_t* grad_output_ptr = grad_output_data + (n * output_height * output_width + oh * output_width + ow) * channels; linear_channels_last_acc(input_indexr(n, ih0, iw0, input_offset), grad_output_ptr, h0lambda * w0lambda, channels); /* i00 */ linear_channels_last_acc(input_indexr(n, ih0, iw1, input_offset), grad_output_ptr, h0lambda * w1lambda, channels); /* i01 */ @@ -687,7 +707,7 @@ void cpu_upsample_linear_backward_channels_last( for (const auto ow : c10::irange(output_width)) { compute_source_index_and_lambda( iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); - scalar_t* grad_output_ptr = grad_output_data + (n * output_depth * output_height * output_width + + const scalar_t* grad_output_ptr = grad_output_data + (n * output_depth * output_height * output_width + od * output_height * output_width + oh * output_width + ow) * channels; linear_channels_last_acc(input_indexr(n, id0, ih0, iw0, input_offset), grad_output_ptr, d0lambda * h0lambda * w0lambda, channels); /* i000 */ linear_channels_last_acc(input_indexr(n, id0, ih0, iw1, input_offset), grad_output_ptr, d0lambda * h0lambda * w1lambda, channels); /* i001 */ @@ -726,7 +746,7 @@ void upsample_linear1d_backward_kernel_impl( const Tensor& grad_output, bool align_corners, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_linear1d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_linear1d_backward", [&] { cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_w}); }); } @@ 
-738,11 +758,11 @@ void upsample_bilinear2d_backward_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_bilinear2d_backward_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_bilinear2d_backward_channels_last", [&] { cpu_upsample_linear_backward_channels_last(grad_input, grad_output, align_corners, {scales_h, scales_w}); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_bilinear2d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_bilinear2d_backward", [&] { cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_h, scales_w}); }); } @@ -756,11 +776,11 @@ void upsample_trilinear3d_backward_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_trilinear3d_backward_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_trilinear3d_backward_channels_last", [&] { cpu_upsample_linear_backward_channels_last(grad_input, grad_output, align_corners, {scales_d, scales_h, scales_w}); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_trilinear3d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_trilinear3d_backward", [&] { cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_d, scales_h, scales_w}); }); } diff --git a/aten/src/ATen/native/cpu/WeightNormKernel.cpp b/aten/src/ATen/native/cpu/WeightNormKernel.cpp index cace911114efe..8d483d24636ed 100644 --- a/aten/src/ATen/native/cpu/WeightNormKernel.cpp +++ b/aten/src/ATen/native/cpu/WeightNormKernel.cpp @@ -70,8 +70,7 @@ inline void sum_norm_per_row( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec v_bvec = bVec::loadu(v_ptr + d); - fVec v_fvec0, v_fvec1; - std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + auto [v_fvec0, v_fvec1] = convert_bfloat16_float(v_bvec); fVec out_fvec0 = fVec::loadu(out_ptr + d) + v_fvec0 * v_fvec0; fVec out_fvec1 = fVec::loadu(out_ptr + d + fVec::size()) + v_fvec1 * v_fvec1; @@ -109,8 +108,7 @@ inline void apply_norm_per_row( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec v_bvec = bVec::loadu(v_ptr + d); - fVec v_fvec0, v_fvec1; - std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + auto [v_fvec0, v_fvec1] = convert_bfloat16_float(v_bvec); fVec w_fvec0 = fVec::loadu(a_ptr + d) * v_fvec0; fVec w_fvec1 = fVec::loadu(a_ptr + d + fVec::size()) * v_fvec1; @@ -249,11 +247,9 @@ inline void sum_product_per_row( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec grad_w_bvec = bVec::loadu(grad_w_ptr + d); - fVec grad_w_fvec0, grad_w_fvec1; - std::tie(grad_w_fvec0, grad_w_fvec1) = convert_bfloat16_float(grad_w_bvec); + auto [grad_w_fvec0, grad_w_fvec1] = convert_bfloat16_float(grad_w_bvec); bVec v_bvec = bVec::loadu(v_ptr + d); - fVec v_fvec0, v_fvec1; - std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + auto [v_fvec0, v_fvec1] = convert_bfloat16_float(v_bvec); fVec out_fvec0 = 
fVec::loadu(out_ptr + d) + grad_w_fvec0 * v_fvec0; fVec out_fvec1 = fVec::loadu(out_ptr + d + fVec::size()) + grad_w_fvec1 * v_fvec1; @@ -298,11 +294,9 @@ inline void apply_per_row_backward( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec grad_w_bvec = bVec::loadu(grad_w_ptr + d); - fVec grad_w_fvec0, grad_w_fvec1; - std::tie(grad_w_fvec0, grad_w_fvec1) = convert_bfloat16_float(grad_w_bvec); + auto [grad_w_fvec0, grad_w_fvec1] = convert_bfloat16_float(grad_w_bvec); bVec v_bvec = bVec::loadu(v_ptr + d); - fVec v_fvec0, v_fvec1; - std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + auto [v_fvec0, v_fvec1] = convert_bfloat16_float(v_bvec); fVec grad_v_fvec0 = fVec::loadu(a_ptr + d) * grad_w_fvec0 - fVec::loadu(b_ptr + d) * v_fvec0; fVec grad_v_fvec1 = fVec::loadu(a_ptr + d + fVec::size()) * grad_w_fvec1 diff --git a/aten/src/ATen/native/cpu/avx_mathfun.h b/aten/src/ATen/native/cpu/avx_mathfun.h index 080cd833d3a10..f4fd3b7bc461f 100644 --- a/aten/src/ATen/native/cpu/avx_mathfun.h +++ b/aten/src/ATen/native/cpu/avx_mathfun.h @@ -240,7 +240,7 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002); _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI -/* evaluation of 8 sines at onces using AVX intrisics +/* evaluation of 8 sines at onces using AVX intrinsics The code is the exact rewriting of the cephes sinf function. Precision is excellent as long as x < 8192 (I did not bother to diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp index 58faea7f51b83..bf007114e78c1 100644 --- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp @@ -34,13 +34,13 @@ void batch_norm_cpu_collect_linear_and_constant_terms( const Tensor& save_mean, const Tensor& save_invstd, const Tensor& running_mean, const Tensor& running_var, bool train, double eps) { - const param_t* weight_data = weight.defined() ? weight.data_ptr() : nullptr; - const param_t* bias_data = bias.defined() ? bias.data_ptr() : nullptr; + const param_t* weight_data = weight.defined() ? weight.const_data_ptr() : nullptr; + const param_t* bias_data = bias.defined() ? bias.const_data_ptr() : nullptr; - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); /// Collect the linear and constant terms regarding the input. 
/// output(n, c, h, w) @@ -91,7 +91,7 @@ batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, save_mean, save_invstd, running_mean, running_var, train, eps); scalar_t* output_data = output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); // Apply the linear terms to the input, // output(n, c, h, w) = input(n, c, h, w) * alpha(c) + beta(c) @@ -143,7 +143,7 @@ batch_norm_cpu_channels_last_impl(Tensor& output, const Tensor& input, save_mean, save_invstd, running_mean, running_var, train, eps); scalar_t* output_data = output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); // Apply the linear terms to the input, // output(n, c, h, w) = input(n, c, h, w) * alpha(c) + beta(c) @@ -185,7 +185,7 @@ batch_norm_cpu_collect_stats_contiguous_impl( int64_t image_size = input.numel() / n_batch / n_channel; int64_t N = input.numel() / n_channel; - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* mean_data = mean.data_ptr(); scalar_t* var_sum_data = var_sum.data_ptr(); @@ -229,7 +229,7 @@ batch_norm_cpu_collect_stats_channels_last_impl( int64_t n_channel = input.size(1); int64_t N = input.numel() / n_channel; - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* mean_data = mean.data_ptr(); scalar_t* var_sum_data = var_sum.data_ptr(); @@ -416,8 +416,8 @@ batch_norm_cpu_backward_contiguous_impl(Tensor& grad_input, Tensor& grad_weight, int64_t image_size = input.numel() / n_batch / n_channel; int64_t N = input.numel() / n_channel; - const scalar_t* grad_output_data = grad_output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* grad_output_data = grad_output.const_data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* grad_input_data = grad_input.defined() ? grad_input.mutable_data_ptr() : nullptr; scalar_t* grad_weight_data = grad_weight.defined() ? grad_weight.data_ptr() : nullptr; @@ -426,11 +426,11 @@ batch_norm_cpu_backward_contiguous_impl(Tensor& grad_input, Tensor& grad_weight, const bool grad_weight_null = grad_weight_data == nullptr; const bool grad_bias_null = grad_bias_data == nullptr; - auto weight_a = conditional_accessor_1d(weight); - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto weight_a = conditional_accessor_1d(weight); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); // parallel dim reduce on 'channel' at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { @@ -537,22 +537,22 @@ batch_norm_cpu_backward_channels_last_impl(Tensor& grad_input, Tensor& grad_weig int64_t n_channel = input.size(1); int64_t N = input.numel() / n_channel; - const scalar_t* grad_output_data = grad_output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* grad_output_data = grad_output.const_data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* grad_input_data = grad_input.defined() ? 
grad_input.mutable_data_ptr() : nullptr; scalar_t* grad_weight_data = grad_weight.defined() ? grad_weight.data_ptr() : nullptr; scalar_t* grad_bias_data = grad_bias.defined() ? grad_bias.data_ptr() : nullptr; - scalar_t* save_mean_data = conditional_data_ptr(save_mean); + const scalar_t* save_mean_data = conditional_data_ptr(save_mean); scalar_t* save_invstd_data = conditional_data_ptr(save_invstd); - scalar_t* running_mean_data = conditional_data_ptr(running_mean); - scalar_t* running_var_data = conditional_data_ptr(running_var); + const scalar_t* running_mean_data = conditional_data_ptr(running_mean); + const scalar_t* running_var_data = conditional_data_ptr(running_var); Tensor weight_ = weight.defined() ? weight : at::ones({n_channel}, input.options()); - const scalar_t* weight_data = weight_.data_ptr(); + const scalar_t* weight_data = weight_.const_data_ptr(); - scalar_t* mean_ptr = nullptr; + const scalar_t* mean_ptr = nullptr; scalar_t* invstd_ptr = nullptr; Tensor invstd = at::empty({0}, input.options()); if (train) { @@ -735,7 +735,7 @@ batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, } scalar_t* output_data = output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); const int64_t loop_size = image_size - (image_size % bVec::size()); at::parallel_for(0, n_batch * n_channel, 1, [&](int64_t begin, int64_t end) { @@ -753,8 +753,7 @@ batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, int64_t d = 0; for (; d < loop_size; d += bVec::size()) { bVec data_bvec = bVec::loadu(input_ptr + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); fVec out_fvec0 = data_fvec0 * alpha_fvec + beta_fvec; fVec out_fvec1 = data_fvec1 * alpha_fvec + beta_fvec; @@ -799,7 +798,7 @@ batch_norm_cpu_channels_last_impl(Tensor& output, const Tensor& input, } scalar_t* output_data = output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); const int64_t loop_size = n_channel - (n_channel % bVec::size()); at::parallel_for(0, n_batch * image_size, 1, [&](int64_t begin, int64_t end) { @@ -813,8 +812,7 @@ batch_norm_cpu_channels_last_impl(Tensor& output, const Tensor& input, fVec beta_fvec0 = fVec::loadu(beta_data + d); fVec beta_fvec1 = fVec::loadu(beta_data + d + fVec::size()); bVec data_bvec = bVec::loadu(input_ptr + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); fVec out_fvec0 = data_fvec0 * alpha_fvec0 + beta_fvec0; fVec out_fvec1 = data_fvec1 * alpha_fvec1 + beta_fvec1; @@ -839,7 +837,7 @@ inline void batch_norm_cpu_collect_stats_contiguous_internal( int64_t image_size = input.numel() / n_batch / n_channel; int64_t N = input.numel() / n_channel; - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); param_t* mean_data = mean.data_ptr(); param_t* var_sum_data = var_sum.data_ptr(); @@ -852,8 +850,7 @@ inline void batch_norm_cpu_collect_stats_contiguous_internal( int64_t d = 0; for (; d < image_size - (image_size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(input_ptr + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); sum_fvec += data_fvec0; 
sum_fvec += data_fvec1; } @@ -874,8 +871,7 @@ inline void batch_norm_cpu_collect_stats_contiguous_internal( int64_t d = 0; for (; d < image_size - (image_size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(input_ptr + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); var_fvec += (data_fvec0 - mean_fvec) * (data_fvec0 - mean_fvec); var_fvec += (data_fvec1 - mean_fvec) * (data_fvec1 - mean_fvec); } @@ -912,7 +908,7 @@ inline void batch_norm_cpu_collect_stats_channels_last_internal( int64_t n_channel = input.size(1); int64_t N = input.numel() / n_channel; - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); param_t* mean_data = mean.data_ptr(); param_t* var_sum_data = var_sum.data_ptr(); @@ -929,8 +925,7 @@ inline void batch_norm_cpu_collect_stats_channels_last_internal( int64_t d = 0; for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(input_ptr + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); fVec sum_fvec0 = fVec::loadu(buffer_ptr + d) + data_fvec0; fVec sum_fvec1 = fVec::loadu(buffer_ptr + d + fVec::size()) + data_fvec1; sum_fvec0.store(buffer_ptr + d); @@ -960,10 +955,8 @@ inline void batch_norm_cpu_collect_stats_channels_last_internal( int64_t d = 0; for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(input_ptr + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); - fVec mean_fvec0, mean_fvec1; - std::tie(mean_fvec0, mean_fvec1) = load2f(mean_data + d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + auto [mean_fvec0, mean_fvec1] = load2f(mean_data + d); fVec var_fvec0 = fVec::loadu(buffer_ptr + d); fVec var_fvec1 = fVec::loadu(buffer_ptr + d + fVec::size()); var_fvec0 += (data_fvec0 - mean_fvec0) * (data_fvec0 - mean_fvec0); @@ -1013,8 +1006,8 @@ void batch_norm_cpu_backward_contiguous_internal(Tensor& grad_input, Tensor& gra int64_t image_size = input.numel() / n_batch / n_channel; int64_t N = input.numel() / n_channel; - const scalar_t* grad_output_data = grad_output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* grad_output_data = grad_output.const_data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* grad_input_data = grad_input.defined() ? grad_input.mutable_data_ptr() : nullptr; param_t* grad_weight_data = grad_weight.defined() ? 
grad_weight.data_ptr() : nullptr; @@ -1023,11 +1016,11 @@ void batch_norm_cpu_backward_contiguous_internal(Tensor& grad_input, Tensor& gra const bool grad_weight_null = grad_weight_data == nullptr; const bool grad_bias_null = grad_bias_data == nullptr; - auto weight_a = conditional_accessor_1d(weight); - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto weight_a = conditional_accessor_1d(weight); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); // parallel dim reduce on 'channel' at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { @@ -1053,14 +1046,12 @@ void batch_norm_cpu_backward_contiguous_internal(Tensor& grad_input, Tensor& gra int64_t d = 0; for (; d < image_size - (image_size % bVec::size()); d += bVec::size()) { bVec dy_bvec = bVec::loadu(dy_ptr + d); - fVec dy_fvec0, dy_fvec1; - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); sum_fvec += dy_fvec0; sum_fvec += dy_fvec1; bVec x_bvec = bVec::loadu(x_ptr + d); - fVec x_fvec0, x_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); dotp_fvec += (x_fvec0 - fVec(mean)) * dy_fvec0; dotp_fvec += (x_fvec1 - fVec(mean)) * dy_fvec1; } @@ -1137,18 +1128,18 @@ void batch_norm_cpu_backward_channels_last_internal(Tensor& grad_input, Tensor& int64_t n_channel = input.size(1); int64_t N = input.numel() / n_channel; - const scalar_t* grad_output_data = grad_output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* grad_output_data = grad_output.const_data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* grad_input_data = grad_input.defined() ? grad_input.mutable_data_ptr() : nullptr; param_t* grad_weight_data = grad_weight.defined() ? grad_weight.data_ptr() : nullptr; param_t* grad_bias_data = grad_bias.defined() ? 
grad_bias.data_ptr() : nullptr; - auto weight_a = conditional_accessor_1d(weight); - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto weight_a = conditional_accessor_1d(weight); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); // use float as acc type bool weight_defined = weight.defined(); @@ -1188,16 +1179,14 @@ void batch_norm_cpu_backward_channels_last_internal(Tensor& grad_input, Tensor& int64_t d = 0; for(; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { bVec dy_bvec = bVec::loadu(dy_ptr + d); - fVec dy_fvec0, dy_fvec1; - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); fVec sum_fvec0 = dy_fvec0 + fVec::loadu(sum_ptr + d); fVec sum_fvec1 = dy_fvec1 + fVec::loadu(sum_ptr + d + fVec::size()); sum_fvec0.store(sum_ptr + d); sum_fvec1.store(sum_ptr + d + fVec::size()); bVec x_bvec = bVec::loadu(x_ptr + d); - fVec x_fvec0, x_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); fVec mean_fvec0 = fVec::loadu(mean_data + d); fVec mean_fvec1 = fVec::loadu(mean_data + d + fVec::size()); fVec dotp_fvec0 = fVec::loadu(dotp_ptr + d); @@ -1246,8 +1235,7 @@ void batch_norm_cpu_backward_channels_last_internal(Tensor& grad_input, Tensor& int64_t d = 0; for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { bVec x_bvec = bVec::loadu(x_ptr + d); - fVec x_fvec0, x_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); fVec mean_fvec0 = fVec::loadu(mean_data + d); fVec mean_fvec1 = fVec::loadu(mean_data + d + fVec::size()); fVec dotp_fvec0 = fVec::loadu(dotp_data + d); @@ -1259,8 +1247,7 @@ void batch_norm_cpu_backward_channels_last_internal(Tensor& grad_input, Tensor& fVec dx_fvec0 = (x_fvec0 - mean_fvec0) * k_fvec0; fVec dx_fvec1 = (x_fvec1 - mean_fvec1) * k_fvec1; bVec dy_bvec = bVec::loadu(dy_ptr + d); - fVec dy_fvec0, dy_fvec1; - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); fVec grad_mean_fvec0 = fVec::loadu(sum_data + d) / fVec(N); fVec grad_mean_fvec1 = fVec::loadu(sum_data + d + fVec::size()) / fVec(N); fVec w_fvec0 = fVec::loadu(weight_data + d); @@ -1287,8 +1274,7 @@ void batch_norm_cpu_backward_channels_last_internal(Tensor& grad_input, Tensor& int64_t d = 0; for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { bVec dy_bvec = bVec::loadu(dy_ptr + d); - fVec dy_fvec0, dy_fvec1; - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); fVec invstd_fvec0 = fVec::loadu(invstd_data + d); fVec invstd_fvec1 = fVec::loadu(invstd_data + d + fVec::size()); fVec w_fvec0 = fVec::loadu(weight_data + d); diff --git a/aten/src/ATen/native/cpu/group_norm_kernel.cpp b/aten/src/ATen/native/cpu/group_norm_kernel.cpp index fc7aad9c28708..f6b7f2a5d4813 100644 --- a/aten/src/ATen/native/cpu/group_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/group_norm_kernel.cpp @@ -43,9 +43,9 @@ void GroupNormKernelImplInternal( TORCH_CHECK(!beta.defined() || beta.numel() == 
C); const int64_t G = group; const int64_t D = C / G; - const T* X_data = X.data_ptr(); - const PT* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const PT* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + const T* X_data = X.const_data_ptr(); + const PT* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const PT* beta_data = beta.defined() ? beta.const_data_ptr() : nullptr; T* Y_data = Y.data_ptr(); PT* mean_data = mean.data_ptr(); PT* rstd_data = rstd.data_ptr(); @@ -298,9 +298,9 @@ void GroupNormKernelImplChannelsLastInternal( TORCH_CHECK(!beta.defined() || beta.numel() == C); const int64_t G = group; const int64_t D = C / G; - const T* X_data = X.data_ptr(); - const PT* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const PT* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + const T* X_data = X.const_data_ptr(); + const PT* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const PT* beta_data = beta.defined() ? beta.const_data_ptr() : nullptr; T* Y_data = Y.data_ptr(); PT* mean_data = mean.data_ptr(); PT* rstd_data = rstd.data_ptr(); @@ -442,7 +442,7 @@ void GroupNormKernelImplChannelsLastInternal( // // We could fuse step 3 and 4 into a single session but this way is better: // a. D might be too small for vectorization; - // b. Avoid duplicate caculation of scale/bias, each HxW plain share the same scale/bias + // b. Avoid duplicate calculation of scale/bias, each HxW plain share the same scale/bias // for (const auto n : c10::irange(N)) { for (const auto g : c10::irange(G)) { @@ -897,11 +897,11 @@ void GroupNormBackwardKernelImplInternal( TORCH_CHECK(mean.numel() == N * group); TORCH_CHECK(rstd.numel() == N * group); TORCH_CHECK(!gamma.defined() || gamma.numel() == C); - const T* dY_data = dY.data_ptr(); - const T* X_data = X.data_ptr(); - const PT* mean_data = mean.data_ptr(); - const PT* rstd_data = rstd.data_ptr(); - const PT* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; + const T* dY_data = dY.const_data_ptr(); + const T* X_data = X.const_data_ptr(); + const PT* mean_data = mean.const_data_ptr(); + const PT* rstd_data = rstd.const_data_ptr(); + const PT* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; T* dX_data = dX.defined() ? dX.data_ptr() : nullptr; PT* dgamma_data = dgamma.defined() ? dgamma.data_ptr() : nullptr; PT* dbeta_data = dbeta.defined() ? dbeta.data_ptr() : nullptr; @@ -1377,11 +1377,11 @@ void GroupNormBackwardKernelImplChannelsLastInternal( TORCH_CHECK(!gamma.defined() || gamma.numel() == C); int64_t D = C / group; int64_t G = group; - const T* dY_data = dY.data_ptr(); - const T* X_data = X.data_ptr(); - const PT* mean_data = mean.data_ptr(); - const PT* rstd_data = rstd.data_ptr(); - const PT* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; + const T* dY_data = dY.const_data_ptr(); + const T* X_data = X.const_data_ptr(); + const PT* mean_data = mean.const_data_ptr(); + const PT* rstd_data = rstd.const_data_ptr(); + const PT* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; T* dX_data = dX.defined() ? dX.data_ptr() : nullptr; PT* dgamma_data = dgamma.defined() ? dgamma.data_ptr() : nullptr; PT* dbeta_data = dbeta.defined() ? 
dbeta.data_ptr() : nullptr; diff --git a/aten/src/ATen/native/cpu/int4mm_kernel.cpp b/aten/src/ATen/native/cpu/int4mm_kernel.cpp new file mode 100644 index 0000000000000..acb4b927f23f5 --- /dev/null +++ b/aten/src/ATen/native/cpu/int4mm_kernel.cpp @@ -0,0 +1,691 @@ +#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#if (defined(_WIN32) || defined(_WIN64)) +#define RESTRICT __restrict +#else +#define RESTRICT __restrict__ +#endif + +namespace at::native { + +namespace { + +inline bool is_block_start(int index, int BLOCK_SIZE) { + return !(index & (BLOCK_SIZE -1)); +} + +#if (defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)) && !defined(_MSC_VER) +// convert 16x int4 to int8, handle 64 bits at a time +// used in avx2 and avx512 +inline __m128i conver_int4_to_int8(const uint8_t* data) { + __m128i tmp = _mm_loadu_si64((const __m128i*)data); + __m128i bytes = _mm_cvtepu8_epi16(tmp); + const __m128i lowMask = _mm_set1_epi8(0xF); + __m128i high = _mm_andnot_si128(lowMask, bytes); + __m128i low = _mm_and_si128(lowMask, bytes); + high = _mm_slli_epi16(high, 4); + bytes = _mm_or_si128(low, high); + return bytes; +} +#endif + +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + +// A block : {BLOCK_M, BLOCK_K}, lda = K +// B block : {BLOCK_K, BLOCK_N / 2}, ldb = BLOCK_N / 2 +// C block : {BLOCK_M, BLOCK_N}, ldc = N +// +// ScaleAndZeros block : {1, BLOCK_N, 2} +// +template +inline void tinygemm_kernel( + const BFloat16* RESTRICT A, + const uint8_t* RESTRICT B, + const BFloat16* RESTRICT ScaleAndZeros, + BFloat16* RESTRICT C, + int lda, + int ldb, + int ldc, + int K, + int BLOCK_K) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + + const int PREFETCH_SIZE_K = 16 * 4; + const int PREFETCH_SIZE_KB = (PREFETCH_SIZE_K + BLOCK_K - 1) / BLOCK_K; + + // number of blocks on K + const int KB = K / BLOCK_K; + + __m512 va; + __m512 vb[COLS]; + __m512 vc[ROWS * COLS]; + __m512 scale[COLS]; + __m512 zero[COLS]; + + // Lookup table to de-quantize int4 values to bf16. 
+ // Values are dequantized as truly int4 [-8, 7] range; + // + // dequant = (bf16(int4_value) * bf16_scale) + bf16_zero + // + static const __m512 lut = _mm512_set_ps( + 7.0f, 6.0f, 5.0f, 4.0f, + 3.0f, 2.0f, 1.0f, 0.0f, + -1.0f, -2.0f, -3.0f, -4.0f, + -5.0f, -6.0f, -7.0f, -8.0f); + + // index for transpose + static const __m512i idx1 = _mm512_set_epi32( + 30, 28, 26, 24, 22, 20, 18, 16, + 14, 12, 10, 8, 6, 4, 2, 0); + static const __m512i idx2 = _mm512_set_epi32( + 31, 29, 27, 25, 23, 21, 19, 17, + 15, 13, 11, 9, 7, 5, 3, 1); + + // load scale and zero point + auto load_scale_and_zeros = [&](int i, int _kb) { + // load 2x bfloat16 vector + __m512i t = _mm512_loadu_si512((__m512i*)(ScaleAndZeros + _kb * ldc * 2 + 32 * i)); + if (_kb + PREFETCH_SIZE_KB < KB) { + _mm_prefetch(ScaleAndZeros + (_kb + PREFETCH_SIZE_KB) * ldc * 2 + 32 * i, _MM_HINT_T0); + } + + // convert to 2x f32 vector + __m512 a, b; + vec::cvtbf16_fp32(t, a, b); + + // transpose scale_and_zero from {16, 2} to {2, 16} + // inputs: + // a: {s0, z0, s1, z1, ..., s7, z7} + // b: {s8, z8, s9, z9, ..., s15, z15} + // output: + // scale: {s0, s1, s2, ..., s15} + // zero: {z0, z1, z2, ..., z15} + scale[i] = _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b); + zero[i] = _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b); + }; + + auto loadc = [&](auto i) { + vc[i] = _mm512_setzero_ps(); + }; + c10::ForcedUnroll{}(loadc); + + auto compute = [&, COLS](auto i, int k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + float aa = static_cast(A[row * lda + k]); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(A + row * lda + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + va = _mm512_set1_ps(aa); + } + + if constexpr (row == 0) { + if constexpr (COLS == 4) { + // when BLOCK_N = 64, handle each row at a time + // to reduce de-quantize overhead. + if constexpr (col == 0) { + __m256i b4 = _mm256_loadu_si256((__m256i*)(B + k * ldb)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(B + (k + PREFETCH_SIZE_K) * ldb, _MM_HINT_T0); + } + + __m512i b32 = _mm512_cvtepu8_epi32(_mm256_castsi256_si128(b4)); + vb[0] = _mm512_permutexvar_ps(b32, lut); + vb[0] = _mm512_fmadd_ps(vb[0], scale[0], zero[0]); + vb[2] = _mm512_permutexvar_ps(_mm512_srli_epi32(b32, 4), lut); + vb[2] = _mm512_fmadd_ps(vb[2], scale[2], zero[2]); + + b32 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(b4, 1)); + vb[1] = _mm512_permutexvar_ps(b32, lut); + vb[1] = _mm512_fmadd_ps(vb[1], scale[1], zero[1]); + vb[3] = _mm512_permutexvar_ps(_mm512_srli_epi32(b32, 4), lut); + vb[3] = _mm512_fmadd_ps(vb[3], scale[3], zero[3]); + } + } else { + __m128i b8 = conver_int4_to_int8(B + k * ldb + col * 8); + __m512i b32 = _mm512_cvtepu8_epi32(b8); + vb[col] = _mm512_permutexvar_ps(b32, lut); + vb[col] = _mm512_fmadd_ps(vb[col], scale[col], zero[col]); + } + } + + constexpr int idx = row * COLS + col; + vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]); + }; + + for (int k = 0, kb = 0; k < K; ++k) { + if (is_block_start(k, BLOCK_K)) { + c10::ForcedUnroll{}(load_scale_and_zeros, kb++); + } + c10::ForcedUnroll{}(compute, k); + } + + //store to C + auto storec = [&, COLS](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + if constexpr (COLS == 4) { + // when BLOCK_N = 64, handle each row at a time + // to reduce `cvtfp32_bf16` overhead. 
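Before the store step below, a scalar reference for the lookup-table dequantization described above may help: each packed byte carries two 4-bit values, the nibble (0..15) is mapped to the signed range [-8, 7] exactly as this kernel's LUT does (nibble minus 8), and the per-group scale and zero are then applied. This standalone sketch uses plain float instead of bf16 and is not part of the patch.

#include <cstdint>

// dequant = float(int4_value) * scale + zero, where int4_value = nibble - 8.
float dequant_int4(uint8_t packed_byte, bool high_nibble, float scale, float zero) {
  int nibble = high_nibble ? (packed_byte >> 4) : (packed_byte & 0x0F);
  return static_cast<float>(nibble - 8) * scale + zero;
}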
+ if constexpr (col == 0) { + __m512i c01 = vec::cvtfp32_bf16(vc[row * 4 + 0], vc[row * 4 + 1]); + __m512i c23 = vec::cvtfp32_bf16(vc[row * 4 + 2], vc[row * 4 + 3]); + _mm512_storeu_si512((__m512i*)(C + row * ldc + 0 * 32), c01); + _mm512_storeu_si512((__m512i*)(C + row * ldc + 1 * 32), c23); + } + } else { + __m256i ci = vec::cvtfp32_bf16(vc[i]); + _mm256_storeu_si256((__m256i*)(C + row * ldc + col * 16), ci); + } + }; + c10::ForcedUnroll{}(storec); +} + +#elif defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) + +template +inline void tinygemm_kernel( + const BFloat16* RESTRICT A, + const uint8_t* RESTRICT B, + const BFloat16* RESTRICT ScaleAndZeros, + BFloat16* RESTRICT C, + int lda, + int ldb, + int ldc, + int K, + int BLOCK_K) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 8; + + const int PREFETCH_SIZE_K = 16 * 4; + const int PREFETCH_SIZE_KB = (PREFETCH_SIZE_K + BLOCK_K - 1) / BLOCK_K; + + // number of blocks on K + const int KB = K / BLOCK_K; + + __m256 va; + __m256 vb[COLS]; + __m256 vc[ROWS * COLS]; + __m256 scale[COLS]; + __m256 zero[COLS]; + + static const __m256i idx1 = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + + // offset to shift from range [0, 15] to [-8, 7] + const __m256 offset = _mm256_set1_ps(-8.0f); + + // load scale and zero point + auto load_scale_and_zeros = [&](int i, int _kb) { + // load 2x bfloat16 vector + __m256i t = _mm256_loadu_si256((__m256i*)(ScaleAndZeros + _kb * ldc * 2 + 16 * i)); + if (_kb + PREFETCH_SIZE_KB < KB) { + _mm_prefetch(ScaleAndZeros + (_kb + PREFETCH_SIZE_KB) * ldc * 2 + 16 * i, _MM_HINT_T0); + } + + // convert to 2x f32 vector + __m256 a, b; + vec::cvtbf16_fp32(t, a, b); + + // transpose scale_and_zero from {8, 2} to {2, 8} + // inputs: + // a: {s0, z0, s1, z1, s2, z2, s3, z3} + // b: {s4, z4, s5, z5, s6, z6, s7, z7} + // output: + // scale: {s0, s1, s2, s3, s4, s5, s6, s7} + // zero: {z0, z1, z2, z3, z4, z5, z6, z7} + a = _mm256_permutevar8x32_ps(a, idx1); + b = _mm256_permutevar8x32_ps(b, idx1); + scale[i] = _mm256_permute2f128_ps(a, b, 0b0100000); + zero[i] = _mm256_permute2f128_ps(a, b, 0b0110001); + + // zero = -8 * scale + zero + zero[i] = _mm256_fmadd_ps(scale[i], offset, zero[i]); + }; + + auto loadc = [&](auto i) { + vc[i] = _mm256_setzero_ps(); + }; + c10::ForcedUnroll{}(loadc); + + auto compute = [&, COLS](auto i, int k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + float aa = static_cast(A[row * lda + k]); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(A + row * lda + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + va = _mm256_set1_ps(aa); + } + + if constexpr (row == 0) { + if constexpr (COLS == 4) { + // when BLOCK_N = 32, handle each row at a time + if constexpr (col == 0) { + __m256i mask = _mm256_set1_epi32(0xF); + __m128i b4 = _mm_loadu_si128((__m128i*)(B + k * ldb)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(B + (k + PREFETCH_SIZE_K) * ldb, _MM_HINT_T0); + } + + __m256i b32 = _mm256_cvtepu8_epi32(b4); + vb[0] = _mm256_cvtepi32_ps(_mm256_and_si256(b32, mask)); + vb[0] = _mm256_fmadd_ps(vb[0], scale[0], zero[0]); + vb[2] = _mm256_cvtepi32_ps(_mm256_srli_epi32(b32, 4)); + vb[2] = _mm256_fmadd_ps(vb[2], scale[2], zero[2]); + + b32 = _mm256_cvtepu8_epi32(_mm_shuffle_epi32(b4, _MM_SHUFFLE(3, 2, 3, 2))); + vb[1] = _mm256_cvtepi32_ps(_mm256_and_si256(b32, mask)); + vb[1] = _mm256_fmadd_ps(vb[1], scale[1], zero[1]); + vb[3] = _mm256_cvtepi32_ps(_mm256_srli_epi32(b32, 4)); + vb[3] = _mm256_fmadd_ps(vb[3], scale[3], zero[3]); + } + } else { + if constexpr 
(col % 2 == 0) { + // de-quantize per 64 bits (16x int4) + __m128i b8 = conver_int4_to_int8(B + k * ldb + col * 4); + __m128i b8_val0 = _mm_set1_epi64x(_mm_extract_epi64(b8, 0)); + __m128i b8_val1 = _mm_set1_epi64x(_mm_extract_epi64(b8, 1)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(B + (k + PREFETCH_SIZE_K) * ldb + col * 4, _MM_HINT_T0); + } + + vb[col] = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(b8_val0)); + vb[col] = _mm256_fmadd_ps(vb[col], scale[col], zero[col]); + vb[col + 1] = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(b8_val1)); + vb[col + 1] = _mm256_fmadd_ps(vb[col + 1], scale[col + 1], zero[col + 1]); + } + } + } + + constexpr int idx = row * COLS + col; + vc[idx] = _mm256_fmadd_ps(va, vb[col], vc[idx]); + }; + for (int k = 0, kb = 0; k < K; ++k) { + if (is_block_start(k, BLOCK_K)) { + c10::ForcedUnroll{}(load_scale_and_zeros, kb++); + } + c10::ForcedUnroll{}(compute, k); + } + + // store to C + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + if constexpr (col % 2 == 0) { + __m256i ci = vec::cvtfp32_bf16(vc[row * COLS + col], vc[row * COLS + col + 1]); + _mm256_storeu_si256((__m256i*)(C + row * ldc + col * 8), ci); + } + }; + c10::ForcedUnroll{}(storec); +} + +#endif + +#if !defined(C10_MOBILE) && defined(__aarch64__) +#include +template +inline void tinygemm_kernel( + const Half* RESTRICT A, + const uint8_t* RESTRICT B, + const Half* RESTRICT ScaleAndZeros, + Half* RESTRICT C, + int lda, + int ldb, + int ldc, + int K, + int BLOCK_K) { + int16_t shift_vals[4] = {0, -4, -8, -12}; + int16x4_t shifts = vld1_s16(shift_vals); + int16x4_t offs = vdup_n_s16(8); + uint16x4_t mask = vdup_n_u16(0x0F); + for (const auto m : c10::irange(BLOCK_M)) { + for (int n = 0; n < BLOCK_N; n+= 16) { + float32x4_t c_val[4]; + float32x4_t scales[4], zeros[4]; + c10::ForcedUnroll<4>{}([&](auto i) { + c_val[i] = vdupq_n_f32(0.0); + }); + for (const auto k : c10::irange(K)) { + const auto a_val = vdupq_n_f32(static_cast(A[m * lda + k])); + if (is_block_start(k, BLOCK_K)) { + int kb = k / BLOCK_K; + c10::ForcedUnroll<4>{}([&](auto i) { + auto scales_and_zeros = vld2_f16(reinterpret_cast(ScaleAndZeros + kb * ldc * 2 + n * 2 + i * 8)); + scales[i] = vcvt_f32_f16(scales_and_zeros.val[0]); + zeros[i] = vcvt_f32_f16(scales_and_zeros.val[1]); + }); + } + c10::ForcedUnroll<4>{}([&](auto i) { + uint16_t b_pack = reinterpret_cast(B + k * ldb + n / 2)[i]; + uint16x4_t b_masked = vand_u16(vshl_u16(vdup_n_u16(b_pack), shifts), mask); + int16x4_t b_ints = vsub_s16(vreinterpret_s16_u16(b_masked), offs); + float32x4_t b_vals = vcvtq_f32_s32(vmovl_s16(b_ints)); + b_vals = vaddq_f32(zeros[i], vmulq_f32(scales[i], b_vals)); + c_val[i] = vfmaq_f32(c_val[i], b_vals, a_val); + }); + } + c10::ForcedUnroll<4>{}([&](auto i) { + vst1_f16(reinterpret_cast(C + m * ldc + n + i * 4), vcvt_f16_f32(c_val[i])); + }); + } + } +} +#endif + +template +inline float convert_int4_to_float(const uint8_t* b, int n) { + static constexpr float lut[16] = { + -8.0f, -7.0f, -6.0f, -5.0f, + -4.0f, -3.0f, -2.0f, -1.0f, + 0.0f, 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, 7.0f + }; + int index; +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + if constexpr (BLOCK_N == 64) { + const int nb = n/BLOCK_N; + n -= nb*BLOCK_N; + if (n < 32) { + auto val = b[nb * BLOCK_N / 2 + n]; + index = val & 0x0f; + } else { + auto val = b[nb * BLOCK_N / 2 + (n - 32)]; + index = val >> 4; + } + } else +#elif defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) + if constexpr (BLOCK_N == 32) { + const int nb = n/BLOCK_N; + n -= 
nb*BLOCK_N; + if (n < 16) { + auto val = b[nb * BLOCK_N / 2 + n]; + index = val & 0x0f; + } else { + auto val = b[nb * BLOCK_N / 2 + (n - 16)]; + index = val >> 4; + } + } else +#endif + { + const auto is_even = (n & 1) == 0; + auto val = b[n/2]; + index = is_even ? (val & 0x0F) : (val >> 4); + } + return lut[index]; +} + +// non-vectorized version +template +inline void tinygemm_kernel( + const T* RESTRICT A, + const uint8_t* RESTRICT B, + const T* RESTRICT ScaleAndZeros, + T* RESTRICT C, + int lda, + int ldb, + int ldc, + int K, + int BLOCK_K) { + + for (const auto m : c10::irange(BLOCK_M)) { + for (const auto n : c10::irange(BLOCK_N)) { + float c_val = 0; + for (const auto k : c10::irange(K)) { + int kb = k / BLOCK_K; + const auto scale = static_cast(ScaleAndZeros[kb * ldc * 2 + n * 2]); + const auto zero = static_cast(ScaleAndZeros[kb * ldc * 2 + n * 2 + 1]); + const auto a_val = static_cast(A[m * lda + k]); + float b_val = convert_int4_to_float(B + k *ldb, n); + b_val = b_val * scale + zero; + + c_val += a_val * b_val; + } + C[m * ldc + n] = c_val; + } + } +} + + +#define LAUNCH_TINYGEMM_KERNEL(MB_SIZE, NB_SIZE) \ + tinygemm_kernel( \ + A_ptr, B_ptr, S_ptr, C_ptr, \ + K, NB_SIZE / 2, N, K, BLOCK_K); + +#define LAUNCH_TINYGEMM_NB_SIZE(MB_SIZE) \ + switch (nb_size) { \ + case 16: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 16); \ + break; \ + case 32: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 32); \ + break; \ + case 48: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 48); \ + break; \ + case 64: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 64); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported n block size: ", nb_size); \ + break; \ + } + +// NB: int4 weight pack (with BLOCK_N 64) +// weight (int32): {N/64, 64, K} +// packed (uint8): {N/64, K, 32} +// +// 1. avx512 packed format: +// When N is 64, to do 256-bit unpacking at a time, we pack Lane0 with Lane2, +// Lane1 with Lane3 since we can only do shift on a 128-bit basis. +// +// weight: +// [Lane0] N0...15: {a00, a01, a02, ...} +// [Lane1] N16...31: {a10, a11, a12, ...} +// [Lane2] N32...47: {a20, a21, a22, ...} +// [Lane3] N48...63: {a30, a31, a32, ...} +// +// packed: +// [Lane02] N0...31: {a20|a00, a21|a01, a22|a02, ...} +// [Lane13] N32...63: {a30|a10, a31|a11, a32|a12, ...} +// +// Note: when N is 16, 32 or 48, pack with 64-bit format. +// +// 2. avx2 packed format: +// When N is 32, to do 128-bit unpacking at a time. +// +// weight: +// [Lane0] N0...15: { a0, a1, a2, ...} +// [Lane1] N16...32: {a16, a17, a18, ...} +// +// packed: +// [Lane01] N0...32: {a16|a0, a17|a1, a18|a2, ...} +// +// Note: When N is 16, pack with 64-bit format +// +// 3 non-vectorized packed format: +// Do 64-bit unpacking at a time. 
+// +// weight: {a0, a1, a2, a3, ..., a14, a15} +// packed: {a1|a0, a3, a2, ..., a15|a14} +// +void weight_to_int4pack_kernel( + const Tensor& weight_packed, + const Tensor& weight, + int N, int K) { + + auto weight_packed_data = reinterpret_cast(weight_packed.data_ptr()); + const auto weight_data = weight.data_ptr(); + + // 64 for avx512 and 32 for avx2/non-vectorized + constexpr int BLOCK_N = vec::Vectorized::size() * 4; + const int NB = (N + BLOCK_N - 1) / BLOCK_N; + + // parallel on NB blocks + at::parallel_for(0, NB, 0, [&](int begin, int end) { + for (const auto i : c10::irange(begin, end)) { + int nb_size = std::min(BLOCK_N, N - i * BLOCK_N); + + const int32_t* src = weight_data + i * BLOCK_N * K; + uint8_t* dst = weight_packed_data + i * K * BLOCK_N / 2; + for (const auto k : c10::irange(K)) { +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + if (nb_size == BLOCK_N) { + for (const auto d : c10::irange(16)) { + int32_t val0 = src[(d + 0) * K + k]; + int32_t val1 = src[(d + 16) * K + k]; + int32_t val2 = src[(d + 32) * K + k]; + int32_t val3 = src[(d + 48) * K + k]; + + uint8_t packed02 = (((uint8_t)(val2) << 4)) | ((uint8_t)(val0)); + uint8_t packed13 = (((uint8_t)(val3) << 4)) | ((uint8_t)(val1)); + + dst[k * 32 + d] = packed02; + dst[k * 32 + 16 + d] = packed13; + } + } else { + // for nb_size 16, 32, 48 + for (int n = 0; n < nb_size; n += 2) { + int32_t val0 = src[n * K + k]; + int32_t val1 = src[n * K + K + k]; + + uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0)); + dst[k * nb_size / 2 + n / 2] = packed; + } + } +#elif defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) + if (nb_size == BLOCK_N) { + // for nb_size 32 + for (const auto d : c10::irange(16)) { + int32_t val0 = src[(d + 0) * K + k]; + int32_t val1 = src[(d + 16) * K + k]; + + uint8_t packed01 = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0)); + dst[k * 16 + d] = packed01; + } + } else { + // for nb_size 16 + for (int n = 0; n < nb_size; n += 2) { + int32_t val0 = src[n * K + k]; + int32_t val1 = src[n * K + K + k]; + + uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0)); + dst[k * nb_size / 2 + n / 2] = packed; + } + } +#else + for (int n = 0; n < nb_size; n += 2) { + int32_t val0 = src[n * K + k]; + int32_t val1 = src[n * K + K + k]; + + uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0)); + dst[k * nb_size / 2 + n / 2] = packed; + } +#endif + } + } + }); +} + +template +void int4pack_mm_kernel_( + const Tensor& C, + const Tensor& A, + const Tensor& B, + int qGroupSize, + const Tensor& qScaleAndZeros, + int N, int K) { + + const auto* A_data = A.const_data_ptr(); + const auto* B_data = reinterpret_cast(B.const_data_ptr()); + auto* C_data = C.data_ptr(); + const auto* S_data = qScaleAndZeros.const_data_ptr(); + + int M = A.size(0); + + constexpr int BLOCK_M = 4; + // 64 for avx512 and 32 for avx2/non-vectorized + constexpr int BLOCK_N = vec::Vectorized::size() * 4; + // 32, 64, 128, 256 + const int BLOCK_K = qGroupSize; + + const int MB = (M + BLOCK_M - 1) / BLOCK_M; + const int NB = (N + BLOCK_N - 1) / BLOCK_N; + + at::parallel_for(0, MB * NB, 0, [&](int begin, int end) { + int mb{0}, nb{0}; + data_index_init(begin, mb, MB, nb, NB); + + for (C10_UNUSED const auto i : c10::irange(begin, end)) { + int mb_start = mb * BLOCK_M; + int mb_size = std::min(BLOCK_M, M - mb_start); + int nb_start = nb * BLOCK_N; + int nb_size = std::min(BLOCK_N, N - nb_start); + + const auto* A_ptr = A_data + mb_start * K; + const auto* B_ptr = B_data + nb_start * K / 2; + const auto* S_ptr = S_data + 
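The scalar fallback branch of weight_to_int4pack_kernel above defines the simplest of the three layouts described in the preceding comment: two adjacent n indices at the same k share one byte, low nibble first. A standalone sketch (not part of the patch) that packs an {N, K} matrix of 4-bit codes into the resulting {K, N/2} byte buffer and checks the round trip against the even/odd-nibble unpacking rule used by convert_int4_to_float; names are illustrative.

// Pairwise int4 nibble packing / unpacking round trip (illustrative only).
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<uint8_t> pack_int4_pairs(const std::vector<int32_t>& w, int N, int K) {
  std::vector<uint8_t> packed(K * N / 2);
  for (int k = 0; k < K; ++k) {
    for (int n = 0; n < N; n += 2) {
      const uint8_t lo = static_cast<uint8_t>(w[n * K + k]) & 0x0F;        // code for n
      const uint8_t hi = static_cast<uint8_t>(w[(n + 1) * K + k]) & 0x0F;  // code for n + 1
      packed[k * N / 2 + n / 2] = static_cast<uint8_t>(hi << 4) | lo;      // hi|lo, as above
    }
  }
  return packed;
}

int unpack_int4(const std::vector<uint8_t>& packed, int N, int k, int n) {
  const uint8_t byte = packed[k * N / 2 + n / 2];
  return (n % 2 == 0) ? (byte & 0x0F) : (byte >> 4);  // even n -> low nibble, odd n -> high
}

int main() {
  const int N = 4, K = 3;
  std::vector<int32_t> w(N * K);
  for (int n = 0; n < N; ++n)
    for (int k = 0; k < K; ++k)
      w[n * K + k] = (n + 2 * k) % 16;  // arbitrary 4-bit codes
  auto packed = pack_int4_pairs(w, N, K);
  for (int n = 0; n < N; ++n)
    for (int k = 0; k < K; ++k)
      assert(unpack_int4(packed, N, k, n) == w[n * K + k]);  // round trip
  return 0;
}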
nb_start * 2; + auto* C_ptr = C_data + mb_start * N + nb_start; + + switch (mb_size) { + case 1: + LAUNCH_TINYGEMM_NB_SIZE(1); + break; + case 2: + LAUNCH_TINYGEMM_NB_SIZE(2); + break; + case 3: + LAUNCH_TINYGEMM_NB_SIZE(3); + break; + case 4: + LAUNCH_TINYGEMM_NB_SIZE(4); + break; + default: + TORCH_CHECK(false, "Unsupported m block size: ", mb_size); + } + + // move to the next index + data_index_step(mb, MB, nb, NB); + } + }); +} + +void int4pack_mm_kernel( + const Tensor& C, + const Tensor& A, + const Tensor& B, + int qGroupSize, + const Tensor& qScaleAndZeros, + int N, int K) { + if (C.scalar_type() == kBFloat16) { + int4pack_mm_kernel_(C, A, B, qGroupSize, qScaleAndZeros, N, K); + } else if (C.scalar_type() == kHalf) { + int4pack_mm_kernel_(C, A, B, qGroupSize, qScaleAndZeros, N, K); + } else { + int4pack_mm_kernel_(C, A, B, qGroupSize, qScaleAndZeros, N, K); + } +} + +} // anonymous namespace + +ALSO_REGISTER_AVX512_DISPATCH(weight_to_int4pack_stub, &weight_to_int4pack_kernel); +ALSO_REGISTER_AVX512_DISPATCH(int4pack_mm_stub, &int4pack_mm_kernel); + +} // at::native diff --git a/aten/src/ATen/native/cpu/int8mm_kernel.cpp b/aten/src/ATen/native/cpu/int8mm_kernel.cpp new file mode 100644 index 0000000000000..bd266030b2566 --- /dev/null +++ b/aten/src/ATen/native/cpu/int8mm_kernel.cpp @@ -0,0 +1,430 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#if (defined(_WIN32) || defined(_WIN64)) +#define RESTRICT __restrict +#else +#define RESTRICT __restrict__ +#endif + +namespace at::native { + +namespace { + +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + +// A block : {BLOCK_M, BLOCK_K}, lda = K +// B block : {BLOCK_K, BLOCK_N}, ldb = K +// C block : {BLOCK_M, BLOCK_N}, ldc = N +// +// scales block: {BLOCK_N} +// +template +inline void tinygemm_kernel( + const BFloat16* RESTRICT A, + const int8_t* RESTRICT B, + const BFloat16* RESTRICT scales, + BFloat16* RESTRICT C, + int lda, + int ldb, + int ldc, + int K) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N; + + const int PREFETCH_SIZE_K = 16 * 4; + + __m512 va; + __m512 vb[COLS]; + __m512 vc[ROWS * COLS]; + __m512 scale[COLS]; + + auto load_scale = [&](int i) { + float ss = static_cast(scales[i]); + scale[i] = _mm512_set1_ps(ss); + }; + c10::ForcedUnroll{}(load_scale); + + auto loadc = [&](auto i) { + vc[i] = _mm512_setzero_ps(); + }; + c10::ForcedUnroll{}(loadc); + + auto compute = [&](auto i, int k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + __m256i a16 = _mm256_load_si256((__m256i*)(A + row * lda + k)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(A + row * lda + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + vec::cvtbf16_fp32(a16, va); + } + + if constexpr (row == 0) { + __m128i b8 = _mm_load_si128((__m128i*)(B + col * ldb + k)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(B + col * ldb + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + __m512i b32 = _mm512_cvtepi8_epi32(b8); + vb[col] = _mm512_cvtepi32_ps(b32); + vb[col] = _mm512_mul_ps(vb[col], scale[col]); + } + + constexpr int idx = row * COLS + col; + vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]); + }; + + for (int k = 0; k < K; k += 16) { + c10::ForcedUnroll{}(compute, k); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + C[row * ldc + col] = static_cast(_mm512_reduce_add_ps(vc[i])); + }; + c10::ForcedUnroll{}(storec); +} + +#elif defined(CPU_CAPABILITY_AVX2) 
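int4pack_mm_kernel_ above walks a flat parallel_for range over MB * NB tiles, decodes each flat index into a (mb, nb) pair, and shrinks the trailing tiles with std::min. A small sketch (not part of the patch) of that decomposition with nb varying fastest; the inline index bookkeeping is an illustrative stand-in for data_index_init / data_index_step.

// Flat range over MB x NB tiles -> (mb, nb) pairs plus tail-aware tile sizes.
#include <algorithm>
#include <cstdio>

int main() {
  const int M = 10, N = 70;                    // problem size
  const int BLOCK_M = 4, BLOCK_N = 32;         // tile size
  const int MB = (M + BLOCK_M - 1) / BLOCK_M;  // 3 row tiles
  const int NB = (N + BLOCK_N - 1) / BLOCK_N;  // 3 column tiles

  const int begin = 0, end = MB * NB;
  int mb = begin / NB;  // what data_index_init(begin, mb, MB, nb, NB) would compute
  int nb = begin % NB;
  for (int i = begin; i < end; ++i) {
    const int mb_start = mb * BLOCK_M;
    const int mb_size = std::min(BLOCK_M, M - mb_start);  // tail tile may be smaller
    const int nb_start = nb * BLOCK_N;
    const int nb_size = std::min(BLOCK_N, N - nb_start);
    std::printf("tile %d: rows [%d, %d), cols [%d, %d)\n",
                i, mb_start, mb_start + mb_size, nb_start, nb_start + nb_size);
    if (++nb == NB) { nb = 0; ++mb; }  // what data_index_step(mb, MB, nb, NB) does
  }
  return 0;
}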
&& !defined(_MSC_VER) + +static inline float _mm256_reduce_add_ps(__m256& v) { + __m256 v1 = _mm256_permute2f128_ps(v, v, 0x1); + v = _mm256_add_ps(v, v1); + v1 = _mm256_shuffle_ps(v, v, 0x4E); + v = _mm256_add_ps(v, v1); + v1 = _mm256_shuffle_ps(v, v, 0xB1); + v = _mm256_add_ps(v, v1); + return _mm256_cvtss_f32(v); +} + +template +inline void tinygemm_kernel( + const BFloat16* RESTRICT A, + const int8_t* RESTRICT B, + const BFloat16* RESTRICT scales, + BFloat16* RESTRICT C, + int lda, + int ldb, + int ldc, + int K) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N; + + const int PREFETCH_SIZE_K = 16 * 4; + + __m256 va; + __m256 vb[COLS]; + __m256 vc[ROWS * COLS]; + __m256 scale[COLS]; + + auto load_scale = [&](int i) { + float ss = static_cast(scales[i]); + scale[i] = _mm256_set1_ps(ss); + }; + c10::ForcedUnroll{}(load_scale); + + auto loadc = [&](auto i) { + vc[i] = _mm256_setzero_ps(); + }; + c10::ForcedUnroll{}(loadc); + + auto compute = [&](auto i, int k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + __m128i a16 = _mm_load_si128((__m128i*)(A + row * lda + k)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(A + row * lda + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + vec::cvtbf16_fp32(a16, va); + } + + if constexpr (row == 0) { + __m128i b8 = _mm_loadu_si64((__m128i*)(B + col * ldb + k)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(B + col * ldb + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + __m256i b32 = _mm256_cvtepi8_epi32(b8); + vb[col] = _mm256_cvtepi32_ps(b32); + vb[col] = _mm256_mul_ps(vb[col], scale[col]); + } + + constexpr int idx = row * COLS + col; + vc[idx] = _mm256_fmadd_ps(va, vb[col], vc[idx]); + }; + + for (int k = 0; k < K; k += 8) { + c10::ForcedUnroll{}(compute, k); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + C[row * ldc + col] = static_cast(_mm256_reduce_add_ps(vc[i])); + }; + c10::ForcedUnroll{}(storec); +} + +#endif + +#if !defined(C10_MOBILE) && defined(__aarch64__) +#include + +inline float reduce(float32x4_t x) { + auto sum = vpaddq_f32(x, x); + return vgetq_lane_f32(vpaddq_f32(sum, sum), 0); +} + +inline float32x4x2_t load_as_float32x4x2(const Half* ptr) { + float16x8_t f16_val = vld1q_f16(reinterpret_cast(ptr)); + auto val_low = vcvt_f32_f16(vget_low_f16(f16_val)); + auto val_high = vcvt_f32_f16(vget_high_f16(f16_val)); + return {val_low, val_high}; +} + +inline float32x4_t load_as_float32x4(const Half* ptr) { + return vcvt_f32_f16(vld1_f16(reinterpret_cast(ptr))); +} + +inline float32x4x2_t load_as_float32x4x2(const BFloat16* ptr) { + int32x4_t shift = vdupq_n_s32(16); + uint16x8_t u16_val = vld1q_u16(reinterpret_cast(ptr)); + uint32x4_t int_low = vmovl_u16(vget_low_u16(u16_val)); + uint32x4_t int_high = vmovl_u16(vget_high_u16(u16_val)); + return {vreinterpretq_f32_u32(vshlq_u32(int_low, shift)), vreinterpretq_f32_u32(vshlq_u32(int_high, shift))}; +} + +inline float32x4_t load_as_float32x4(const BFloat16* ptr) { + int32x4_t shift = vdupq_n_s32(16); + uint32x4_t as_int = vmovl_u16(vld1_u16(reinterpret_cast(ptr))); + return vreinterpretq_f32_u32(vshlq_u32(as_int, shift)); +} + +inline float32x4_t load_as_float32x4(const float* ptr) { + return vld1q_f32(ptr); +} + +inline float32x4x2_t load_as_float32x4x2(const float* ptr) { + return {vld1q_f32(ptr), vld1q_f32(ptr + 4)}; +} + +template +inline void tinygemm_kernel_( + const T* RESTRICT A, + const int8_t* RESTRICT B, + const T* RESTRICT scales, + T* RESTRICT C, + int lda, + int ldb, + int 
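The bfloat16 loaders above rely on bfloat16 being the upper half of an IEEE float32: widening the 16-bit pattern and shifting it left by 16 (vshlq_u32 plus vreinterpretq_f32_u32) reproduces the float exactly. A scalar sketch of the same trick, not part of the patch; the truncating encoder exists only to build test inputs and is not taken from this code.

// bfloat16 bits -> float32 by shifting into the high half (illustrative only).
#include <cassert>
#include <cstdint>
#include <cstring>

float bf16_bits_to_float(uint16_t bits) {
  const uint32_t widened = static_cast<uint32_t>(bits) << 16;  // same idea as vshlq_u32(..., 16)
  float out;
  std::memcpy(&out, &widened, sizeof(out));  // reinterpret, like vreinterpretq_f32_u32
  return out;
}

uint16_t float_to_bf16_bits_truncate(float value) {
  uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);  // drop the low 16 mantissa bits
}

int main() {
  // Values whose mantissa fits in bfloat16 round-trip exactly.
  for (float v : {1.0f, -2.5f, 0.15625f, 1024.0f}) {
    assert(bf16_bits_to_float(float_to_bf16_bits_truncate(v)) == v);
  }
  return 0;
}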
ldc, + int K) { + + for (const auto m : c10::irange(BLOCK_M)) { + float32x4_t c_val[BLOCK_N]; + c10::ForcedUnroll{}([&](auto i) { + c_val[i] = vdupq_n_f32(0.0); + }); + for (int k = 0; k < K; k += 8) { + auto a_val = load_as_float32x4x2(A + m * lda + k); + c10::ForcedUnroll{}([&](auto i) { + int16x8_t b_val = vmovl_s8(vld1_s8(B + i * ldb + k)); + auto b_val_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_val))); + auto b_val_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_val))); + c_val[i] = vfmaq_f32(c_val[i], a_val.val[1], b_val_high); + c_val[i] = vfmaq_f32(c_val[i], a_val.val[0], b_val_low); + }); + } + + float32x4_t scale_val = load_as_float32x4(scales); + c10::ForcedUnroll{}([&](auto i) { + C[m * ldc + i] = reduce(c_val[i]) * vgetq_lane_f32(scale_val, i); + }); + } +} + +template +inline void tinygemm_kernel( + const Half* RESTRICT A, + const int8_t* RESTRICT B, + const Half* RESTRICT scales, + Half* RESTRICT C, + int lda, + int ldb, + int ldc, + int K) { + tinygemm_kernel_(A, B, scales, C, lda, ldb, ldc, K); +} + +template +inline void tinygemm_kernel( + const BFloat16* RESTRICT A, + const int8_t* RESTRICT B, + const BFloat16* RESTRICT scales, + BFloat16* RESTRICT C, + int lda, + int ldb, + int ldc, + int K) { + tinygemm_kernel_(A, B, scales, C, lda, ldb, ldc, K); +} + +template +inline void tinygemm_kernel( + const float* RESTRICT A, + const int8_t* RESTRICT B, + const float* RESTRICT scales, + float* RESTRICT C, + int lda, + int ldb, + int ldc, + int K) { + tinygemm_kernel_(A, B, scales, C, lda, ldb, ldc, K); +} +#endif + +// non-vectorized version +template +inline void tinygemm_kernel( + const T* RESTRICT A, + const int8_t* RESTRICT B, + const T* RESTRICT scales, + T* RESTRICT C, + int lda, + int ldb, + int ldc, + int K) { + + for (const auto m : c10::irange(BLOCK_M)) { + for (const auto n : c10::irange(BLOCK_N)) { + float c_val = 0; + float scale_val = static_cast(scales[n]); + for (const auto k : c10::irange(K)) { + float a_val = static_cast(A[m * lda + k]); + float b_val = static_cast(B[n * ldb + k]); + c_val += a_val * (b_val * scale_val); + } + C[m * ldc + n] = c_val; + } + } +} + +#define LAUNCH_TINYGEMM_KERNEL(MB_SIZE, NB_SIZE) \ + tinygemm_kernel( \ + A_ptr, B_ptr, S_ptr, C_ptr, \ + K, K, N, K); + +#define LAUNCH_TINYGEMM_NB_SIZE(MB_SIZE) \ + switch (nb_size) { \ + case 1: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 1); \ + break; \ + case 2: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 2); \ + break; \ + case 3: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 3); \ + break; \ + case 4: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 4); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported n block size: ", nb_size); \ + break; \ + } + +template +void int8pack_mm_kernel_( + const Tensor& C, + const Tensor& A, + const Tensor& B, + const Tensor& scales) { + + const auto* A_data = A.const_data_ptr(); + const auto* B_data = B.const_data_ptr(); + auto* C_data = C.data_ptr(); + const auto* S_data = scales.const_data_ptr(); + + int M = A.size(0); + int N = B.size(0); + int K = A.size(1); + + constexpr int BLOCK_M = 4; + constexpr int BLOCK_N = 4; + + const int MB = (M + BLOCK_M - 1) / BLOCK_M; + const int NB = (N + BLOCK_N - 1) / BLOCK_N; + + at::parallel_for(0, MB * NB, 0, [&](int begin, int end) { + int mb{0}, nb{0}; + data_index_init(begin, mb, MB, nb, NB); + + for (const auto i : c10::irange(begin, end)) { + (void)i; + + int mb_start = mb * BLOCK_M; + int mb_size = std::min(BLOCK_M, M - mb_start); + int nb_start = nb * BLOCK_N; + int nb_size = std::min(BLOCK_N, N - nb_start); + + const auto* A_ptr = A_data + 
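All of the int8 tinygemm variants above compute the same reference result: activations A of shape {M, K} against a row-major int8 weight B of shape {N, K} (so effectively A * B^T), with one floating-point scale per output channel n. A plain C++ reference, not part of the patch, using float activations for brevity.

// Reference int8 matmul with per-output-channel scales (illustrative only).
#include <cstdint>
#include <cstdio>
#include <vector>

void int8_scaled_mm_ref(
    const std::vector<float>& A,       // {M, K}
    const std::vector<int8_t>& B,      // {N, K}
    const std::vector<float>& scales,  // {N}
    std::vector<float>& C,             // {M, N}
    int M, int N, int K) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) {
        acc += A[m * K + k] * (static_cast<float>(B[n * K + k]) * scales[n]);
      }
      C[m * N + n] = acc;
    }
  }
}

int main() {
  const int M = 2, N = 3, K = 4;
  std::vector<float> A = {1, 2, 3, 4,  0.5f, -1, 2, 0};
  std::vector<int8_t> B = {1, 0, -1, 2,  3, 3, 3, 3,  -2, 1, 0, 4};
  std::vector<float> scales = {1.f, 0.5f, 0.25f};
  std::vector<float> C(M * N);
  int8_scaled_mm_ref(A, B, scales, C, M, N, K);
  for (float v : C) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}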
mb_start * K; + const auto* B_ptr = B_data + nb_start * K; + const auto* S_ptr = S_data + nb_start; + auto* C_ptr = C_data + mb_start * N + nb_start; + + switch (mb_size) { + case 1: + LAUNCH_TINYGEMM_NB_SIZE(1); + break; + case 2: + LAUNCH_TINYGEMM_NB_SIZE(2); + break; + case 3: + LAUNCH_TINYGEMM_NB_SIZE(3); + break; + case 4: + LAUNCH_TINYGEMM_NB_SIZE(4); + break; + default: + TORCH_CHECK(false, "Unsupported m block size: ", mb_size); + } + + // move to the next index + data_index_step(mb, MB, nb, NB); + } + }); +} + +void int8pack_mm_kernel( + const Tensor& C, + const Tensor& A, + const Tensor& B, + const Tensor& scales) { + if (C.dtype() == kHalf) { + int8pack_mm_kernel_(C, A, B, scales); + } else if (C.dtype() == kBFloat16) { + int8pack_mm_kernel_(C, A, B, scales); + } else { + int8pack_mm_kernel_(C, A, B, scales); + } +} + +} // anonymous namespace + +ALSO_REGISTER_AVX512_DISPATCH(int8pack_mm_stub, &int8pack_mm_kernel); + +} // at::native diff --git a/aten/src/ATen/native/cpu/int_mm_kernel.h b/aten/src/ATen/native/cpu/int_mm_kernel.h new file mode 100644 index 0000000000000..f215078d61f91 --- /dev/null +++ b/aten/src/ATen/native/cpu/int_mm_kernel.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include + +namespace at::native { + +using weight_to_int4pack_fn = void(*)(const Tensor&, const Tensor&, int, int); +using int4pack_mm_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, int, const Tensor&, int, int); +using int8pack_mm_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&); + +DECLARE_DISPATCH(weight_to_int4pack_fn, weight_to_int4pack_stub); +DECLARE_DISPATCH(int4pack_mm_fn, int4pack_mm_stub); +DECLARE_DISPATCH(int8pack_mm_fn, int8pack_mm_stub); + +} // namespace at::native diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp index a668305d462ab..c2dbd0d7c7858 100644 --- a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp @@ -36,9 +36,9 @@ void LayerNormKernelImplInternal( Tensor* mean, Tensor* rstd) { using Vec = vec::Vectorized; - const T* X_data = X.data_ptr(); - const T* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const T* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + const T* X_data = X.const_data_ptr(); + const T* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const T* beta_data = beta.defined() ? beta.const_data_ptr() : nullptr; T* Y_data = Y->data_ptr(); T* mean_data = mean ? mean->data_ptr() : nullptr; T* rstd_data = rstd ? rstd->data_ptr() : nullptr; @@ -51,9 +51,7 @@ void LayerNormKernelImplInternal( for (const auto i : c10::irange(start, end)) { const T* X_ptr = X_data + i * N; T* Y_ptr = Y_data + i * N; - T mean_val; - T rstd_val; - std::tie(mean_val, rstd_val) = RowwiseMoments(X_ptr, N); + auto [mean_val, rstd_val] = RowwiseMoments(X_ptr, N); rstd_val = T(1) / std::sqrt(rstd_val + eps); const T scale = rstd_val; const T bias = - mean_val; @@ -98,9 +96,9 @@ void layer_norm_kernel_mixed_type( Tensor* rstd) { using bVec = Vectorized; using fVec = Vectorized; - const T* X_data = X.data_ptr(); - const param_t* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const param_t* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + const T* X_data = X.const_data_ptr(); + const param_t* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const param_t* beta_data = beta.defined() ? beta.const_data_ptr() : nullptr; T* Y_data = Y->data_ptr(); param_t* mean_data = mean ? 
mean->data_ptr() : nullptr; param_t* rstd_data = rstd ? rstd->data_ptr() : nullptr; @@ -113,9 +111,7 @@ void layer_norm_kernel_mixed_type( for (const auto i : c10::irange(start, end)) { const T* X_ptr = X_data + i * N; T* Y_ptr = Y_data + i * N; - float mean_val; - float rstd_val; - std::tie(mean_val, rstd_val) = RowwiseMoments(X_ptr, N); + auto [mean_val, rstd_val] = RowwiseMoments(X_ptr, N); rstd_val = float(1) / std::sqrt(rstd_val + eps); const float scale = rstd_val; const float bias = -rstd_val * mean_val; @@ -373,10 +369,9 @@ void layer_norm_backward_frame( if (N < bVec::size()) { bVec x_bvec = bVec::loadu(X_ptr, N); bVec dy_bvec = bVec::loadu(dY_ptr, N); - fVec x_fvec0, x_fvec1, dy_fvec0, dy_fvec1, gamma_fvec0, gamma_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); - std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data, N); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); + auto [gamma_fvec0, gamma_fvec1] = load2f(gamma_data, N); if (N > fVec::size()) { fVec db_fvec0 = dy_fvec0 * gamma_fvec0; fVec db_fvec1 = dy_fvec1 * gamma_fvec1; @@ -396,11 +391,10 @@ void layer_norm_backward_frame( int64_t d = bVec::size(); bVec x_bvec = bVec::loadu(X_ptr); bVec dy_bvec = bVec::loadu(dY_ptr); - fVec x_fvec0, x_fvec1, dy_fvec0, dy_fvec1, gamma_fvec0, gamma_fvec1; fVec ds_fvec0, ds_fvec1, db_fvec0, db_fvec1, acc_ds_fvec0, acc_ds_fvec1, acc_db_fvec0, acc_db_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); - std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); + auto [gamma_fvec0, gamma_fvec1] = load2f(gamma_data); acc_db_fvec0 = dy_fvec0 * gamma_fvec0; acc_db_fvec1 = dy_fvec1 * gamma_fvec1; acc_ds_fvec0 = x_fvec0 * acc_db_fvec0; @@ -470,10 +464,9 @@ void layer_norm_backward_frame( for (; d < N - (N % bVec::size()); d += bVec::size()) { bVec x_bvec = bVec::loadu(X_ptr + d); bVec dy_bvec = bVec::loadu(dY_ptr + d); - fVec x_fvec0, x_fvec1, dy_fvec0, dy_fvec1, gamma_fvec0, gamma_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); - std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data + d); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); + auto [gamma_fvec0, gamma_fvec1] = load2f(gamma_data + d); fVec r_fvec0 = fVec(a) * dy_fvec0 * gamma_fvec0 + fVec(b) * x_fvec0 + fVec(c); fVec r_fvec1 = fVec(a) * dy_fvec1 * gamma_fvec1 + fVec(b) * x_fvec1 + fVec(c); bVec r_bvec = convert_from_float(r_fvec0, r_fvec1); @@ -482,10 +475,9 @@ void layer_norm_backward_frame( if (N - d > 0) { bVec x_bvec = bVec::loadu(X_ptr + d, N - d); bVec dy_bvec = bVec::loadu(dY_ptr + d, N - d); - fVec x_fvec0, x_fvec1, dy_fvec0, dy_fvec1, gamma_fvec0, gamma_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); - std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data + d, N - d); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); + auto [gamma_fvec0, gamma_fvec1] = load2f(gamma_data + d, N - d); fVec r_fvec0 = fVec(a) * dy_fvec0 * gamma_fvec0 + fVec(b) * x_fvec0 + fVec(c); fVec r_fvec1 = fVec(a) * dy_fvec1 * gamma_fvec1 + fVec(b) * x_fvec1 + fVec(c); 
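The layer-norm changes above repeatedly swap a declare-then-std::tie pattern for a C++17 structured binding, which keeps the unpacked values in a single declaration. A small illustration of the before/after, not from the patch; RowwiseMomentsLike is a made-up stand-in for RowwiseMoments.

// std::tie vs. structured binding for a two-value return (illustrative only).
#include <cmath>
#include <cstdio>
#include <tuple>
#include <utility>

std::pair<float, float> RowwiseMomentsLike(const float* x, int n) {
  float mean = 0.f, var = 0.f;
  for (int i = 0; i < n; ++i) mean += x[i];
  mean /= n;
  for (int i = 0; i < n; ++i) var += (x[i] - mean) * (x[i] - mean);
  return {mean, var / n};
}

int main() {
  const float x[4] = {1.f, 2.f, 3.f, 4.f};

  // Before: declare first, then std::tie into the existing variables.
  float mean_val, rstd_val;
  std::tie(mean_val, rstd_val) = RowwiseMomentsLike(x, 4);

  // After: one declaration, names bound directly to the returned elements.
  auto [mean_val2, rstd_val2] = RowwiseMomentsLike(x, 4);
  rstd_val2 = 1.f / std::sqrt(rstd_val2 + 1e-5f);

  std::printf("%f %f %f %f\n", mean_val, rstd_val, mean_val2, rstd_val2);
  return 0;
}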
bVec r_bvec = convert_from_float(r_fvec0, r_fvec1); @@ -513,12 +505,12 @@ void LayerNormBackwardKernelImplInternal( TORCH_DCHECK_EQ(mean.numel(), M); TORCH_DCHECK_EQ(rstd.numel(), M); DCHECK(!gamma.defined() || gamma.numel() == N); - const T* dY_data = dY.template data_ptr(); - const T* X_data = X.template data_ptr(); - const T2* mean_data = mean.template data_ptr(); - const T2* rstd_data = rstd.template data_ptr(); + const T* dY_data = dY.template const_data_ptr(); + const T* X_data = X.template const_data_ptr(); + const T2* mean_data = mean.template const_data_ptr(); + const T2* rstd_data = rstd.template const_data_ptr(); const T2* gamma_data = - gamma.defined() ? gamma.template data_ptr() : nullptr; + gamma.defined() ? gamma.template const_data_ptr() : nullptr; T* dX_data = dX->defined() ? dX->template data_ptr() : nullptr; T2* dgamma_data = dgamma->defined() ? dgamma->template data_ptr() : nullptr; T2* dbeta_data = dbeta->defined() ? dbeta->template data_ptr() : nullptr; diff --git a/aten/src/ATen/native/cpu/moments_utils.h b/aten/src/ATen/native/cpu/moments_utils.h index c89aa6b3f602d..f5337f5ff4ebe 100644 --- a/aten/src/ATen/native/cpu/moments_utils.h +++ b/aten/src/ATen/native/cpu/moments_utils.h @@ -93,8 +93,7 @@ UpdateMomentsVec( fVec m2_fvec0(0), m2_fvec1(0); for (const auto j : c10::irange(m0)) { const Vec x_bvec = Vec::loadu(X_ptr + j * Vec::size()); - fVec x_fvec0, x_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); const fVec delta_fvec0 = x_fvec0 - m1_fvec0; const fVec delta_fvec1 = x_fvec1 - m1_fvec1; m1_fvec0 += delta_fvec0 * c_vecs[j]; diff --git a/aten/src/ATen/native/cpu/utils.h b/aten/src/ATen/native/cpu/utils.h index 6607c287cf0e3..641ac0cd06125 100644 --- a/aten/src/ATen/native/cpu/utils.h +++ b/aten/src/ATen/native/cpu/utils.h @@ -21,6 +21,11 @@ inline void _store(at::BFloat16* dst, at::vec::Vectorized src) { res.store(dst, at::vec::Vectorized::size()); } +inline void _store(at::Half* dst, at::vec::Vectorized src) { + auto res = at::vec::convert_float_half(src, src); + res.store(dst, at::vec::Vectorized::size()); +} + inline namespace CPU_CAPABILITY { template @@ -56,8 +61,7 @@ struct Vec2 { Vec2(Vectorized v0, Vectorized v1) : val0(v0), val1(v1) {} Vec2(float v) : val0(v), val1(v) {} static Vec2 loadu(const BFloat16* ptr) { - Vectorized v0, v1; - std::tie(v0, v1) = convert_bfloat16_float(Vectorized::loadu(ptr)); + auto [v0, v1] = convert_bfloat16_float(Vectorized::loadu(ptr)); return {v0, v1}; } static Vec2 loadu(const float* ptr) { diff --git a/aten/src/ATen/native/cpu/zmath.h b/aten/src/ATen/native/cpu/zmath.h index 3f3971e6e76e2..9b52039e84f91 100644 --- a/aten/src/ATen/native/cpu/zmath.h +++ b/aten/src/ATen/native/cpu/zmath.h @@ -2,7 +2,6 @@ // Complex number math operations that act as no-ops for other dtypes. 
#include -#include #include #include diff --git a/aten/src/ATen/native/cuda/Activation.cpp b/aten/src/ATen/native/cuda/Activation.cpp index 633a5f386a87e..6bbfd985d3572 100644 --- a/aten/src/ATen/native/cuda/Activation.cpp +++ b/aten/src/ATen/native/cuda/Activation.cpp @@ -44,8 +44,8 @@ Tensor& glu_backward_cuda_out(const Tensor& grad_output, const Tensor& input, const auto iter = at::TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(grad_output) .resize_outputs(false) .declare_static_shape(iter_shape) .build(); @@ -80,7 +80,7 @@ std::tuple log_sigmoid_forward_out_cuda(const Tensor& input, T // NOTE: buffer is only used by CPU dispatch, we just ignore it here auto iter = TensorIteratorConfig() .add_output(result) - .add_input(input) + .add_const_input(input) .build(); launch_log_sigmoid_forward_kernel(iter); return std::forward_as_tuple(result, buffer); diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu index 32cfb8fcf0339..9db469cd4f752 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu @@ -703,7 +703,7 @@ namespace { ); } while (!done && max_threads); if (!done) { - TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accomodate shaedMemPerBlock limit"); + TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accomodate sharedMemPerBlock limit"); } break; } diff --git a/aten/src/ATen/native/cuda/AveragePool3d.cu b/aten/src/ATen/native/cuda/AveragePool3d.cu index a722236ea57cc..f4b0ee00d9a9a 100644 --- a/aten/src/ATen/native/cuda/AveragePool3d.cu +++ b/aten/src/ATen/native/cuda/AveragePool3d.cu @@ -34,7 +34,7 @@ __device__ inline int max(int a, int b) { template __global__ void avg_pool3d_cuda_update_output( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int kT, int kH, int kW, int dT, int dH, int dW, @@ -88,7 +88,7 @@ __global__ void avg_pool3d_cuda_update_output( { for (wi = wstart; wi < wend; ++wi) { - scalar_t val = input[slice][ti][hi][wi]; + const scalar_t val = input[slice][ti][hi][wi]; sum += val; } } @@ -103,7 +103,7 @@ __global__ void avg_pool3d_cuda_update_output( // template __global__ void avg_pool3d_cuda_update_output( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int kT, int kH, int dT, int dH, int dW, @@ -157,7 +157,7 @@ __global__ void avg_pool3d_cuda_update_output( { for (wi = wstart; wi < wend; ++wi) { - scalar_t val = input[slice][ti][hi][wi]; + const scalar_t val = input[slice][ti][hi][wi]; sum += val; } } @@ -169,7 +169,7 @@ __global__ void avg_pool3d_cuda_update_output( template __global__ void avg_pool3d_single_backward_out_frame_stride1( - PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 gradOutput, PackedTensorAccessor64 gradInput, int kT, int kH, int kW, accscalar_t normFactor, @@ -184,7 +184,7 @@ __global__ void avg_pool3d_single_backward_out_frame_stride1( if (iRow < gradInput.size(2) && iCol < gradInput.size(3)) { accscalar_t sum = 0.0; - scalar_t *gOut = &gradOutput[slice][max(0, iFrame - kT + 1)] + const scalar_t *gOut = &gradOutput[slice][max(0, iFrame - kT + 1)] [max(0, iRow - kH + 1)][max(0, iCol - kW + 1)]; int frameOffset = 0; for (int oFrame = max(0, iFrame - kT + 1); @@ -214,7 +214,7 @@ __global__ void avg_pool3d_single_backward_out_frame_stride1( template __global__ void 
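The add_const_input and PackedTensorAccessor<const scalar_t, ...> changes in this area all serve the same purpose: once a read-only operand is expressed with a const element type, an accidental write through it becomes a compile error rather than a silent mutation. A minimal sketch of that effect, not from the patch; Accessor is an illustrative stand-in for the real accessor and iterator types.

// Const element type turns writes through a read-only view into compile errors.
#include <cstdio>

template <typename T>
struct Accessor {
  T* data;
  int size;
  T& operator[](int i) const { return data[i]; }
};

void scale_kernel(Accessor<const float> in, Accessor<float> out, float alpha) {
  for (int i = 0; i < in.size; ++i) {
    out[i] = alpha * in[i];
    // in[i] = 0.f;  // would not compile: assignment to a const element
  }
}

int main() {
  float src[3] = {1.f, 2.f, 3.f};
  float dst[3] = {};
  scale_kernel(Accessor<const float>{src, 3}, Accessor<float>{dst, 3}, 2.f);
  std::printf("%g %g %g\n", dst[0], dst[1], dst[2]);
  return 0;
}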
avg_pool3d_cuda_update_grad_input_atomic( - PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 gradOutput, PackedTensorAccessor64 gradInput, int kT, int kH, int kW, int dT, int dH, int dW, @@ -273,7 +273,7 @@ __global__ void avg_pool3d_cuda_update_grad_input_atomic( template __global__ void avg_pool3d_cuda_update_grad_input( - PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 gradOutput, PackedTensorAccessor64 gradInput, int kT, int kH, int kW, int dT, int dH, int dW, @@ -333,7 +333,7 @@ __global__ void avg_pool3d_cuda_update_grad_input( #define LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \ avg_pool3d_cuda_update_output \ <<>>( \ - work_input.packed_accessor64(), \ + work_input.packed_accessor64(), \ work_output.packed_accessor64(), \ kT, kH, \ dT, dH, dW, \ @@ -422,7 +422,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cuda) ( default: avg_pool3d_cuda_update_output <<>>( - work_input.packed_accessor64(), + work_input.packed_accessor64(), work_output.packed_accessor64(), kT, kH, kW, dT, dH, dW, @@ -543,7 +543,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cuda) ( avg_pool3d_single_backward_out_frame_stride1 <<>>( - work_grad_output.packed_accessor64(), + work_grad_output.packed_accessor64(), work_grad_input.packed_accessor64(), kT, kH, kW, 1.0f/divide_factor, @@ -573,7 +573,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cuda) ( if (kernelsOverlap) { avg_pool3d_cuda_update_grad_input_atomic <<>>( - work_grad_output.packed_accessor64(), + work_grad_output.packed_accessor64(), work_grad_input.packed_accessor64(), kT, kH, kW, dT, dH, dW, @@ -585,7 +585,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cuda) ( else { avg_pool3d_cuda_update_grad_input <<>>( - work_grad_output.packed_accessor64(), + work_grad_output.packed_accessor64(), work_grad_input.packed_accessor64(), kT, kH, kW, dT, dH, dW, diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 35a247725a3ea..df6f470916428 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include @@ -17,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -153,7 +157,7 @@ enum class Activation { GELU, }; -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activation a) { switch (a) { case Activation::None: @@ -172,6 +176,12 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa static bool getDisableAddmmCudaLt() { static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); #ifdef USE_ROCM + // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) + // note the current tunable op is not the hipblaslt path (gemm_and_bias) + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + return true; + } // allow both CUDA and HIP env var names for ROCm builds // also, current default for ROCm builds is disable by default if (env_value == nullptr) { @@ -226,7 +236,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma at::ScalarType scalar_type = self.scalar_type(); c10::MaybeOwned self_; if (&result != &self) { -#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040 && !defined(_MSC_VER)) || 
defined(USE_ROCM) && ROCM_VERSION >= 50700 +#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || (defined(USE_ROCM) && (ROCM_VERSION >= 50700)) // Strangely, if mat2 has only 1 row or column, we get // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] @@ -250,10 +260,13 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && #endif +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12010 && !defined(USE_ROCM)) + mat2_sizes[0] > 1 && mat2_sizes[1] > 1; +#else mat2_sizes[0] > 1 && mat2_sizes[1] > 1 && mat2_sizes[0] < 65535 * 32 && mat2_sizes[1] < 65535 * 32 && mat1_sizes[0] < 65535 * 32 && mat1_sizes[1] < 65535 * 32 && - // avoid leaing dim >> rows bugs + // avoid leading dim >> rows bugs ((mat1.strides()[0] == 1 && mat1.strides()[1] == mat1_sizes[0]) || (mat1.strides()[1] == 1 && mat1.strides()[0] == mat1_sizes[1]) || (scalar_type != at::ScalarType::Half && @@ -262,6 +275,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma (mat2.strides()[1] == 1 && mat2.strides()[0] == mat2_sizes[1]) || (scalar_type != at::ScalarType::Half && scalar_type != at::ScalarType::BFloat16)); +#endif } #endif if (!useLtInterface) { @@ -309,7 +323,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma // That requires some fixing some internal build dependencies though. return at::mul_out( result, - self, + self.expand(result.sizes()), at::native::scalar_tensor( beta, self.scalar_type(), @@ -320,8 +334,9 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && (ROCM_VERSION >= 50700)) if (useLtInterface) { +#if defined(USE_ROCM) AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, @@ -335,26 +350,53 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma args.n, args.k, alpha.to>(), - args.mata->data_ptr(), + args.mata->const_data_ptr(), args.lda, - args.matb->data_ptr(), + args.matb->const_data_ptr(), args.ldb, - self.const_data_ptr(), + // This condition is needed for mm case on ROCm for hipblasLt path. + // Passing the bias ptr as null to avoid accuracy issues for mm case. + (&result != &self) ? self.const_data_ptr() : nullptr, args.result->data_ptr(), args.result_ld, -#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11080) || defined(USE_ROCM) activation_to_gemm_and_blas_arg(activation) + ); + }); #else - // GELU is not supported (and does not compile!) prior - // to CUDA 11.4. Have observed accuracy issues with - // GELU epilogue in 11.4; disabling the GELU epilogue - // path for CUDA version < 11.8. - activation != Activation::GELU - ? activation_to_gemm_and_blas_arg(activation) - : cuda::blas::GEMMAndBiasActivationEpilogue::None + auto activation_epilogue = activation_to_gemm_and_blas_arg(activation); +#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11080)) + // GELU is not supported (and does not compile!) prior + // to CUDA 11.4. Have observed accuracy issues with + // GELU epilogue in 11.4; disabling the GELU epilogue + // path for CUDA version < 11.8. 
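The gemm_and_bias call above asks cuBLASLt / hipBLASLt for a fused epilogue: roughly out[i][j] = act(alpha * sum_k mat1[i][k] * mat2[k][j] + bias[j]), with the length-n bias broadcast across rows and the activation chosen by activation_to_gemm_and_blas_arg (this sketch assumes beta == 1, the case the fused path targets). A scalar reference of that shape, not from the patch; the erf-based GELU below is only the textbook definition, not necessarily the epilogue's exact formula.

// Reference for a gemm + bias + activation epilogue (illustrative only).
#include <cmath>
#include <cstdio>
#include <vector>

enum class Epilogue { None, ReLU, GELU };

float apply_epilogue(float x, Epilogue e) {
  switch (e) {
    case Epilogue::ReLU: return x > 0.f ? x : 0.f;
    case Epilogue::GELU: return 0.5f * x * (1.f + std::erf(x / std::sqrt(2.f)));
    default: return x;
  }
}

void gemm_bias_act_ref(int m, int n, int k, float alpha,
                       const std::vector<float>& mat1,  // {m, k}
                       const std::vector<float>& mat2,  // {k, n}
                       const std::vector<float>& bias,  // {n}
                       std::vector<float>& out,         // {m, n}
                       Epilogue e) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += mat1[i * k + p] * mat2[p * n + j];
      out[i * n + j] = apply_epilogue(alpha * acc + bias[j], e);
    }
  }
}

int main() {
  std::vector<float> mat1 = {1, -2, 3, 4};   // 2x2
  std::vector<float> mat2 = {1, 0, 0, 1};    // 2x2 identity
  std::vector<float> bias = {0.5f, -0.5f};
  std::vector<float> out(4);
  gemm_bias_act_ref(2, 2, 2, 1.f, mat1, mat2, bias, out, Epilogue::ReLU);
  for (float v : out) std::printf("%g ", v);  // 1.5 0 3.5 3.5
  std::printf("\n");
  return 0;
}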
+ if (activation == Activation::GELU) + activation_epilogue = cuda::blas::GEMMAndBiasActivationEpilogue::None; #endif + + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + scalar_type, + "addmm_cuda_lt", + [&] { + at::cuda::blas::gemm_and_bias( + args.transa == 't', + args.transb == 't', + args.m, + args.n, + args.k, + alpha.to>(), + args.mata->const_data_ptr(), + args.lda, + args.matb->const_data_ptr(), + args.ldb, + self.const_data_ptr(), + args.result->data_ptr(), + args.result_ld, + activation_epilogue ); }); +#endif } else #endif { @@ -728,7 +770,7 @@ Tensor& _int_mm_out_cuda(const Tensor& self, const Tensor& mat2, Tensor& result) TORCH_CHECK(result.is_contiguous(), "Expected result to be contiguous."); -#if !defined(USE_ROCM) && !defined(_MSC_VER) && defined(CUDA_VERSION) && CUDA_VERSION >= 11070 +#if (!defined(USE_ROCM) && defined(CUDA_VERSION) && (CUDA_VERSION >= 11070)) || (defined(USE_ROCM) && (ROCM_VERSION >= 60000)) cublasCommonArgs args(self, mat2, result); at::cuda::blas::int8_gemm( @@ -748,7 +790,7 @@ Tensor& _int_mm_out_cuda(const Tensor& self, const Tensor& mat2, Tensor& result) result.copy_(*args.result); } #else -#if !defined(USE_ROCM) && !defined(_MSC_VER) && defined(CUDA_VERSION) +#if !defined(USE_ROCM) && defined(CUDA_VERSION) TORCH_CHECK(false, "_int_mm_out_cuda not compiled for CUDA ", CUDA_VERSION); #else TORCH_CHECK(false, "_int_mm_out_cuda not compiled for this platform."); @@ -763,12 +805,42 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { return _int_mm_out_cuda(self, mat2, result); } +static bool _scaled_mm_allowed_device() { + auto dprops = at::cuda::getCurrentDeviceProperties(); +#ifdef USE_ROCM + std::string device_arch = dprops->gcnArchName; + static const std::vector archs = {"gfx940", "gfx941", "gfx942"}; + for (std::string arch : archs) { + size_t substring = device_arch.find(arch); + if (substring != std::string::npos) { + return true; + } + } + return false; +#else + return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); +#endif +} + // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax // Scales are only applicable when matrices are of Float8 type and assumbed to be equal to 1.0 by default. // If output matrix type is 16 or 32-bit type, neither scale_result is applied nor amax is computed. 
// Known limitations: // - Only works if mat1 is row-major and mat2 is column-major // - Only works if matrices sizes are divisible by 32 +// +// Arguments: +// - `mat1`: the first operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `mat2`: the second operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16` +// - `out_dtype`: the output dtype, can either be a float8 or a higher precision floating point type +// - `scale_a`: a scalar tensor with the inverse scale of `mat1`, only needed if `mat1` is a float8 type +// - `scale_b`: a scalar tensor with the inverse scale of `mat2`, only needed if `mat2` is a float8 type +// - `scale_result`: a scalar tensor with the scale of the output, only set if the output is a float8 type +// - `use_fast_accum`: if true, enables fast float8 accumulation +// - `out`: a reference to the output tensor +// - `amax`: a reference to the amax tensor of the output, only needed if the output is a float8 type and will be updated inplace + std::tuple _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, const c10::optional& bias, @@ -779,8 +851,8 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, bool use_fast_accum, Tensor& out, Tensor& amax) { // Check sizes - auto dprops = at::cuda::getCurrentDeviceProperties(); - TORCH_CHECK(dprops->major >= 9, "torch._scaled_mm is only supported on devices with compute capability >= 9.0)"); + bool allowed_device = _scaled_mm_allowed_device(); + TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); TORCH_CHECK( @@ -796,7 +868,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, " but got ", bias->numel()); TORCH_CHECK( mat1.sizes()[1] % 16 == 0, - "Expected trailing dimension of mat1 to be divisble by 16 ", + "Expected trailing dimension of mat1 to be divisible by 16 ", "but got mat1 shape: (", mat1.sizes()[0], "x", @@ -838,36 +910,121 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]}); at::native::resize_output(amax, {}); -#if !defined(USE_ROCM) && !defined(_MSC_VER) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && (ROCM_VERSION >= 60000)) cublasCommonArgs args(mat1, mat2, out); const auto out_dtype_ = args.result->scalar_type(); TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt"); - at::cuda::blas::scaled_gemm( - args.transa, - args.transb, - args.m, - args.n, - args.k, - args.mata->data_ptr(), - scale_a ? scale_a->data_ptr() : nullptr, - args.lda, - args.mata->scalar_type(), - args.matb->data_ptr(), - scale_b ? scale_b->data_ptr() : nullptr, - args.ldb, - args.matb->scalar_type(), - bias ? bias->data_ptr(): nullptr, - bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_, - args.result->data_ptr(), - scale_result ? 
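As a rough model of the contract documented above (not from the patch, and not an exact cuBLASLt specification): each float8 operand is brought back to real values with its per-tensor scale, the product is accumulated in float, amax records the largest magnitude of that accumulated result, and scale_result only matters when the output itself is a float8 type. That reading is also why the ROCm branch further down recomputes amax from the finished output. Float8 storage is faked with plain floats to keep the sketch short.

// Rough scalar model of a scaled float8 matmul with amax tracking (illustrative only).
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void scaled_mm_ref(int m, int n, int k,
                   const std::vector<float>& a_fp8, float scale_a,  // {m, k} payloads
                   const std::vector<float>& b_fp8, float scale_b,  // {k, n} payloads
                   std::vector<float>& out, float* amax) {          // {m, n}
  *amax = 0.f;
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) {
        acc += (a_fp8[i * k + p] * scale_a) * (b_fp8[p * n + j] * scale_b);
      }
      out[i * n + j] = acc;
      *amax = std::max(*amax, std::fabs(acc));  // largest |entry| before any output cast
    }
  }
}

int main() {
  std::vector<float> a = {1, 2, 3, 4};   // 2x2 stand-in "float8" payloads
  std::vector<float> b = {1, -1, 1, 1};  // 2x2
  std::vector<float> out(4);
  float amax = 0.f;
  scaled_mm_ref(2, 2, 2, a, 0.5f, b, 2.f, out, &amax);
  std::printf("amax = %g\n", amax);
  return 0;
}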
scale_result->data_ptr() : nullptr, - args.result_ld, - out_dtype_, - amax.data_ptr(), - use_fast_accum); +#ifdef USE_ROCM + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { +#define TUNABLE_DISPATCH(BLASOP_A, BLASOP_B) \ + if (mat1.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fnuz, at::Float8_e4m3fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fnuz, at::Float8_e5m2fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if (mat1.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2fnuz, at::Float8_e4m3fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2fnuz, at::Float8_e5m2fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } + AT_DISPATCH_V2(out_dtype_, "_tunable_scaled_gemm", AT_WRAP([&] { + bool transa_ = ((args.transa != 'n') && (args.transa != 'N')); + bool transb_ = ((args.transb != 'n') && (args.transb != 'N')); + at::cuda::tunable::ScaledGemmParams params; + params.transa = args.transa; + params.transb = args.transb; + params.m = args.m; + params.n = args.n; + params.k = args.k; + params.a = args.mata->data_ptr(); + params.a_scale_ptr = scale_a ? scale_a->data_ptr() : nullptr; + params.lda = args.lda; + params.a_dtype = args.mata->scalar_type(); + params.b = args.matb->data_ptr(); + params.b_scale_ptr = scale_b ? scale_b->data_ptr() : nullptr; + params.ldb = args.ldb; + params.b_dtype = args.matb->scalar_type(); + params.bias_ptr = bias ? bias->data_ptr(): nullptr; + params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_; + params.c = args.result->data_ptr(); + params.c_scale_ptr = scale_result ? scale_result->data_ptr() : nullptr; + params.ldc = args.result_ld; + params.c_dtype = out_dtype_; + params.amax_ptr = amax.data_ptr(); + params.use_fast_accum = use_fast_accum; + if (transa_ && transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T) + } + else if (transa_ && !transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::N) + } + else if (!transa_ && transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::T) + } + else if (!transa_ && !transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::N) + } + else { + TORCH_CHECK(false, "unreachable"); + } + }), + kHalf, kBFloat16, kFloat8_e4m3fnuz, kFloat8_e5m2fnuz, AT_EXPAND(AT_FLOATING_TYPES)); +#undef TUNABLE_DISPATCH + } + else +#endif + { + at::cuda::blas::scaled_gemm( + args.transa, + args.transb, + args.m, + args.n, + args.k, + args.mata->data_ptr(), + scale_a ? scale_a->data_ptr() : nullptr, + args.lda, + args.mata->scalar_type(), + args.matb->data_ptr(), + scale_b ? scale_b->data_ptr() : nullptr, + args.ldb, + args.matb->scalar_type(), + bias ? bias->data_ptr(): nullptr, + bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? 
at::ScalarType::Half : out_dtype_, + args.result->data_ptr(), + scale_result ? scale_result->data_ptr() : nullptr, + args.result_ld, + out_dtype_, + amax.data_ptr(), + use_fast_accum); + } #else TORCH_CHECK(false, "_scaled_mm_out_cuda is not compiled for this platform."); #endif +#if defined(USE_ROCM) && ROCM_VERSION >= 60000 + // rocm's hipblaslt does not yet support amax, so calculate separately + amax = at::max(at::abs(out.to(kFloat))); +#endif + return {out, amax}; } diff --git a/aten/src/ATen/native/cuda/Bucketization.cu b/aten/src/ATen/native/cuda/Bucketization.cu index c85c059f91c01..05d5421b046f8 100644 --- a/aten/src/ATen/native/cuda/Bucketization.cu +++ b/aten/src/ATen/native/cuda/Bucketization.cu @@ -18,7 +18,7 @@ namespace at::native { // Implement a numpy like searchsorted and a TF like bucketize function running on cuda -// See details in ATen/nativate/Bucketization.cpp +// See details in ATen/native/Bucketization.cpp namespace { @@ -149,7 +149,7 @@ Tensor& searchsorted_out_cuda( return result; } - // for non-contiguous result tensors, we write the output to a contiguous copy so we can later copy back, maintaing the original result tensor + // for non-contiguous result tensors, we write the output to a contiguous copy so we can later copy back, maintaining the original result tensor Tensor out = result; if (!result.is_contiguous()) { out = result.contiguous(); diff --git a/aten/src/ATen/native/cuda/CUDAJitLoops.cuh b/aten/src/ATen/native/cuda/CUDAJitLoops.cuh index 39b8a5bab4b7a..e764cc4ce8039 100644 --- a/aten/src/ATen/native/cuda/CUDAJitLoops.cuh +++ b/aten/src/ATen/native/cuda/CUDAJitLoops.cuh @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index b7e4026eabb6b..b8eb85fd4eb2e 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -39,7 +39,6 @@ #include #include #include -#include #include #ifdef __NVCC__ @@ -303,6 +302,20 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { bool contiguous = iter.is_contiguous(); if (contiguous) { +#ifdef USE_ROCM + at::detail::Array dtypes; + auto inner_strides = iter.get_inner_strides(); + at::detail::Array strides; + for (int i = 0; i < ntensors; i++) { + dtypes[i] = iter.dtype(i); + strides[i] = inner_strides[i]; + } + launch_legacy_kernel<512, 1>(numel, [=]GPU_LAMBDA(int idx) { + void* out = data[0] + strides[0] * idx; + arg0_t result = invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx); + c10::cast_and_store(dtypes[0], out, result); + }); +#else auto loader = memory::LoadWithCast(iter); auto storer = memory::StoreWithCast<1>(iter); auto input_offset_calculator = TrivialOffsetCalculator(); @@ -315,6 +328,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { output_offset_calculator, loader, storer); +#endif } else { at::detail::Array dtypes; for (int i = 0; i < ntensors; i++) { @@ -324,8 +338,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) { auto offsets = offset_calc.get(idx); void* out = data[0] + offsets[0]; - arg0_t result = - invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1); + arg0_t result = invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1); c10::cast_and_store(dtypes[0], out, result); }); } diff --git a/aten/src/ATen/native/cuda/CUDAScalar.cu b/aten/src/ATen/native/cuda/CUDAScalar.cu index 
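The new ROCm contiguous path above addresses operands as byte pointers with per-element strides in bytes, so element idx lives at base + stride * idx and the element type is only resolved at runtime through c10::cast_and_store. A host-side sketch of that addressing pattern, not from the patch; cast_and_store_like is an illustrative stand-in.

// Byte-strided element addressing with a runtime dtype switch (illustrative only).
#include <cstdint>
#include <cstdio>
#include <cstring>

enum class Dtype { Float32, Float64, Int32 };

void cast_and_store_like(Dtype dt, void* dst, double value) {
  switch (dt) {
    case Dtype::Float32: { float v = static_cast<float>(value); std::memcpy(dst, &v, sizeof(v)); break; }
    case Dtype::Float64: { std::memcpy(dst, &value, sizeof(value)); break; }
    case Dtype::Int32:   { int32_t v = static_cast<int32_t>(value); std::memcpy(dst, &v, sizeof(v)); break; }
  }
}

int main() {
  float out[4] = {};
  char* data0 = reinterpret_cast<char*>(out);  // byte pointer, as in the kernel
  const int64_t stride0 = sizeof(float);       // contiguous: one element per step, in bytes
  for (int idx = 0; idx < 4; ++idx) {
    const double result = 1.5 * idx;           // stand-in for invoke(f, ...)
    cast_and_store_like(Dtype::Float32, data0 + stride0 * idx, result);
  }
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}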
8f5208ab59194..428c80a7e0e19 100644 --- a/aten/src/ATen/native/cuda/CUDAScalar.cu +++ b/aten/src/ATen/native/cuda/CUDAScalar.cu @@ -1,6 +1,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -16,10 +17,19 @@ Scalar _local_scalar_dense_cuda(const Tensor& self) { Scalar r; AT_DISPATCH_V2( self.scalar_type(), "_local_scalar_dense_cuda", AT_WRAP([&] { - scalar_t value; + // Create pinned memory for the scalar value to avoid implicit + // locking/sync in cuda library due to pageable memory + auto value = at::detail::empty_cpu( + {1}, /* size */ + c10::CppTypeToScalarType(), /* dtype */ + c10::nullopt, /* layout */ + c10::nullopt, /* device */ + true, /* pin_memory */ + c10::nullopt /* memory format */ + ); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - at::cuda::memcpy_and_sync(&value, self.const_data_ptr(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream); - r = Scalar(value); + at::cuda::memcpy_and_sync((void *)value.const_data_ptr(), self.const_data_ptr(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream); + r = Scalar(*value.const_data_ptr()); }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); return r; } diff --git a/aten/src/ATen/native/cuda/CompareEQKernel.cu b/aten/src/ATen/native/cuda/CompareEQKernel.cu index 9966c3b085050..9496ae95d13b2 100644 --- a/aten/src/ATen/native/cuda/CompareEQKernel.cu +++ b/aten/src/ATen/native/cuda/CompareEQKernel.cu @@ -33,7 +33,7 @@ C10_NOINLINE void compare_eq_ne_kernel(TensorIteratorBase &iter, EqOpType op) { AT_DISPATCH_V2(iter.common_dtype(), "compare_eq_ne_cuda", AT_WRAP([&]() { opmath_symmetric_gpu_kernel_with_scalars( iter, CompareEqFunctor(op)); - }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBFloat16, kBool, kFloat8_e4m3fn, kFloat8_e5m2, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); + }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBFloat16, kBool, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } void eq_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu index a405e93b1e034..9e45e2693cb0f 100644 --- a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu +++ b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu @@ -173,7 +173,7 @@ void slow_conv2d_forward( "slow_conv2d_cuda", [&] { // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: + // Matrix multiply per output: auto input_n = input.select(0, elt); auto output_n = output.select(0, elt); @@ -255,7 +255,7 @@ void slow_conv2d_backward( "slow_conv2d_backward_cuda", [&] { // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per sample: + // Matrix multiply per sample: auto grad_input_n = grad_input.select(0, elt); auto grad_output_n = grad_output.select(0, elt); @@ -327,10 +327,10 @@ void slow_conv2d_grad_weight( "slow_conv2d_grad_weight_cuda", [&] { // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: + // Matrix multiply per output: auto grad_output_n = grad_output.select(0, elt); - // Matrix mulitply per output: + // Matrix multiply per output: auto input_n = input.select(0, elt); if (requires_columns) { diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index 81149085354da..fad81d59d45c9 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ 
b/aten/src/ATen/native/cuda/Copy.cu @@ -20,7 +20,7 @@ #include #include -// TODO(NS): Investigate why FP8 conversion intrisncs end up being slower +// TODO(NS): Investigate why FP8 conversion intrinsics end up being slower #ifdef AT_USE_NV_CVT_INTRINSICS #include #endif @@ -35,7 +35,6 @@ void float8_copy_kernel_cuda(TensorIteratorBase &iter) { ScalarType other_dtype = iter.dtype(1); if (dtype == kFloat8_e4m3fn) { switch (other_dtype) { -#if !defined(USE_ROCM) case kFloat: gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) { return Float8_e4m3fn(value); @@ -51,14 +50,12 @@ void float8_copy_kernel_cuda(TensorIteratorBase &iter) { return Float8_e4m3fn(value); }); break; -#endif /* !defined(USE_ROCM) */ default: gpu_kernel(iter, [] GPU_LAMBDA(Float8_e4m3fn x) { return x; }); break; } } else if (dtype == kFloat8_e5m2) { switch (other_dtype) { -#if !defined(USE_ROCM) case kFloat: gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) { #ifdef AT_USE_NV_CVT_INTRINSICS @@ -89,11 +86,52 @@ void float8_copy_kernel_cuda(TensorIteratorBase &iter) { #endif }); break; -#endif /* !defined(USE_ROCM) */ default: gpu_kernel(iter, [] GPU_LAMBDA(Float8_e5m2 x) { return x; }); break; } + } else if (dtype == kFloat8_e4m3fnuz) { + switch (other_dtype) { + case kFloat: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) { + return Float8_e4m3fnuz(value); + }); + break; + case kHalf: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(Half value) { + return Float8_e4m3fnuz(value); + }); + break; + case kBFloat16: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(BFloat16 value) { + return Float8_e4m3fnuz(value); + }); + break; + default: + gpu_kernel(iter, [] GPU_LAMBDA(Float8_e4m3fnuz x) { return x; }); + break; + } + } else if (dtype == kFloat8_e5m2fnuz) { + switch (other_dtype) { + case kFloat: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) { + return Float8_e5m2fnuz(value); + }); + break; + case kHalf: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(Half value) { + return Float8_e5m2fnuz(value); + }); + break; + case kBFloat16: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(BFloat16 value) { + return Float8_e5m2fnuz(value); + }); + break; + default: + gpu_kernel(iter, [] GPU_LAMBDA(Float8_e5m2fnuz x) { return x; }); + break; + } } else { TORCH_CHECK(false, "This supposed ot be called only for Float8 types"); } @@ -107,16 +145,14 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) { AT_DISPATCH_QINT_TYPES(dtype, "copy_", [&] { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) { return x; }); }); - } else if (dtype == kFloat8_e5m2 || dtype == kFloat8_e4m3fn) { + } else if (dtype == kFloat8_e5m2 || dtype == kFloat8_e4m3fn || dtype == kFloat8_e5m2fnuz || dtype == kFloat8_e4m3fnuz) { float8_copy_kernel_cuda(iter); -#if !defined(USE_ROCM) } else if (isBitsType(dtype)) { TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting " "bits types to different bits types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype); AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] { gpu_kernel_nocast(iter, [] GPU_LAMBDA(scalar_t x) { return x; }); }); -#endif /* !defined(USE_ROCM) */ } else { AT_DISPATCH_V2( dtype, "copy_", AT_WRAP([&] { @@ -266,9 +302,11 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) { Tensor src_contig; // If non_blocking is true - type conversions are performed on the GPU - // for CPU-GPU copies, otherwise type conversions are performed on the CPU. - // Type conversions are performed on the src device for GPU-GPU copies. 
- if (iter.device_type(0) == kCUDA || non_blocking) { + // For blocking transfers conversions are performed on CPU to avoid allocating + // extra GPU memory + // for GPU-GPU transfers conversions are performed on the source device + auto conversion_device = non_blocking ? kCUDA : kCPU; + if (iter.device_type(1) == conversion_device) { dst_contig = dst.is_contiguous() ? dst : at::empty_like(dst, LEGACY_CONTIGUOUS_MEMORY_FORMAT); src_contig = iter.tensor(1).to(iter.dtype(0)).expand_as(dst).contiguous(); } else { diff --git a/aten/src/ATen/native/cuda/CrossKernel.cu b/aten/src/ATen/native/cuda/CrossKernel.cu index 956ce2446dc18..560d419c982b5 100644 --- a/aten/src/ATen/native/cuda/CrossKernel.cu +++ b/aten/src/ATen/native/cuda/CrossKernel.cu @@ -68,8 +68,8 @@ void cross_impl(const Tensor& result, const Tensor& x1, const Tensor& x2, int64_ auto iter = TensorIteratorConfig() .add_output(result) - .add_input(x1) - .add_input(x2) + .add_const_input(x1) + .add_const_input(x2) .resize_outputs(false) .declare_static_shape(result.sizes(), /*squash_dims=*/dim) .build(); diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index edeb8e8c82f80..6bcd57027d517 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -123,11 +123,7 @@ static bool is_pow_of_two(int64_t x) { return (x & (x - 1)) == 0; } -#if defined(USE_ROCM) - using cufft_size_type = int; -#else - using cufft_size_type = long long int; -#endif +using cufft_size_type = long long int; using CuFFTDimVector = c10::SmallVector; @@ -299,25 +295,6 @@ class CuFFTConfig { // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. const bool simple_layout = in_layout.simple && out_layout.simple; - -#if defined(USE_ROCM) - hipfftType exec_type = [&]{ - if (dtype == kFloat) { - switch (fft_type) { - case CuFFTTransformType::C2C: return HIPFFT_C2C; - case CuFFTTransformType::R2C: return HIPFFT_R2C; - case CuFFTTransformType::C2R: return HIPFFT_C2R; - } - } else if (dtype == kDouble) { - switch (fft_type) { - case CuFFTTransformType::C2C: return HIPFFT_Z2Z; - case CuFFTTransformType::R2C: return HIPFFT_D2Z; - case CuFFTTransformType::C2R: return HIPFFT_Z2D; - } - } - TORCH_CHECK(false, "hipFFT doesn't support transforms of type: ", dtype); - }(); -#else cudaDataType itype, otype, exec_type; const auto complex_input = cufft_complex_input(fft_type); const auto complex_output = cufft_complex_output(fft_type); @@ -336,7 +313,6 @@ class CuFFTConfig { } else { TORCH_CHECK(false, "cuFFT doesn't support tensor of type: ", dtype); } -#endif // disable auto allocation of workspace to use THC allocator CUFFT_CHECK(cufftSetAutoAllocation(plan(), /* autoAllocate */ 0)); @@ -350,29 +326,15 @@ class CuFFTConfig { // by assuming istride = ostride = 1. // // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. 
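The rewritten predicate in copy_kernel_cuda above takes the source-side conversion branch exactly when the source device matches (non_blocking ? CUDA : CPU), which is what the new comment spells out. A tiny sketch of just that decision, not from the patch; Device stands in for c10::DeviceType.

// Which side performs the dtype conversion in copy_kernel_cuda (illustrative only).
#include <cassert>

enum class Device { CPU, CUDA };

bool convert_on_source(Device src, bool non_blocking) {
  const Device conversion_device = non_blocking ? Device::CUDA : Device::CPU;
  return src == conversion_device;
}

int main() {
  assert(convert_on_source(Device::CPU,  /*non_blocking=*/false));  // blocking host source: convert on CPU
  assert(convert_on_source(Device::CUDA, /*non_blocking=*/true));   // non-blocking device source: convert on GPU
  assert(!convert_on_source(Device::CPU,  true));
  assert(!convert_on_source(Device::CUDA, false));
  return 0;
}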
-#if defined(USE_ROCM) - CUFFT_CHECK(hipfftMakePlanMany(plan(), signal_ndim, signal_sizes.data(), - /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, - /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, - exec_type, batch, &ws_size_t)); -#else CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, batch, &ws_size_t, exec_type)); -#endif } else { -#if defined(USE_ROCM) - CUFFT_CHECK(hipfftMakePlanMany(plan(), signal_ndim, signal_sizes.data(), - in_layout.embed.data(), in_layout.stride, in_layout.dist, - out_layout.embed.data(), out_layout.stride, out_layout.dist, - exec_type, batch, &ws_size_t)); -#else CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), in_layout.embed.data(), in_layout.stride, in_layout.dist, itype, out_layout.embed.data(), out_layout.stride, out_layout.dist, otype, batch, &ws_size_t, exec_type)); -#endif } ws_size = static_cast(ws_size_t); } diff --git a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu index cfa0e8a029ed1..69757df220886 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu @@ -32,10 +32,10 @@ PackedTensorAccessor32 dummy_packed_accessor32() { template __global__ void conv_depthwise2d_forward_kernel( - const PackedTensorAccessor32 input, + const PackedTensorAccessor32 input, PackedTensorAccessor32 output, - const PackedTensorAccessor32 weight, - const PackedTensorAccessor32 bias, + const PackedTensorAccessor32 weight, + const PackedTensorAccessor32 bias, bool biasEnabled, index_t totalElements, const int outputChannels, @@ -103,9 +103,9 @@ __global__ void conv_depthwise2d_forward_kernel( template __global__ void conv_depthwise2d_backward_kernel( - const PackedTensorAccessor32 grad_output, + const PackedTensorAccessor32 grad_output, PackedTensorAccessor32 grad_input, - const PackedTensorAccessor32 weight, + const PackedTensorAccessor32 weight, index_t totalElements, const int inputChannels, const int depthwiseMultiplier, @@ -174,8 +174,8 @@ __global__ void conv_depthwise2d_backward_kernel( template __global__ void conv_depthwise2d_grad_weight_kernel( - const PackedTensorAccessor32 grad_output, - const PackedTensorAccessor32 input, + const PackedTensorAccessor32 grad_output, + const PackedTensorAccessor32 input, PackedTensorAccessor32 grad_weight, const int batchSize, const int inputChannels, @@ -309,12 +309,12 @@ void conv_depthwise2d_forward_out( // Create PackedTensorAccessor // Kernel currently relies upon all the Tensors to be contiguous, but we made // them contiguous above - const auto input_a = input.packed_accessor32(); - const auto weight_a = weight.packed_accessor32(); + const auto input_a = input.packed_accessor32(); + const auto weight_a = weight.packed_accessor32(); const auto output_a = output.packed_accessor32(); const auto bias_a = has_bias ? 
- bias.packed_accessor32() : - dummy_packed_accessor32(); + bias.packed_accessor32() : + dummy_packed_accessor32(); if (kW == 3 && kH == 3) { conv_depthwise2d_forward_kernel<3> <<>>( input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier, @@ -387,9 +387,9 @@ void conv_depthwise2d_backward_out( const auto stream = c10::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, grad_output.scalar_type(), "conv_depthwise2d_backward_cuda", [&] { - auto grad_output_a = grad_output.packed_accessor32(); + auto grad_output_a = grad_output.packed_accessor32(); auto grad_input_a = grad_input.packed_accessor32(); - auto weight_a = weight.packed_accessor32(); + auto weight_a = weight.packed_accessor32(); if (kW == 3 && kH == 3) { if (dW == 1 && dH == 1){ @@ -501,8 +501,8 @@ void conv_depthwise2d_grad_weight_out( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, grad_output.scalar_type(), "conv_depthwise2d_grad_weight_cuda", [&] { - const auto grad_output_a = grad_output.packed_accessor32(); - const auto input_a = input.packed_accessor32(); + const auto grad_output_a = grad_output.packed_accessor32(); + const auto input_a = input.packed_accessor32(); const auto grad_weight_a = grad_weight.packed_accessor32(); using acc_t = at::acc_type; int warp_size = at::cuda::warp_size(); diff --git a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu index 631c2677900cd..991471a6ef82f 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu @@ -26,9 +26,9 @@ template __global__ void conv_depthwise3d_cuda_kernel( - const PackedTensorAccessor32 input, + const PackedTensorAccessor32 input, PackedTensorAccessor32 output, - const PackedTensorAccessor32 kernel, + const PackedTensorAccessor32 kernel, const scalar_t* bias, int strideT, int strideH, int strideW, int paddingT, int paddingH, int paddingW, @@ -99,9 +99,9 @@ template __global__ void conv_depthwise3d_cuda_backward_input_kernel( - const PackedTensorAccessor32 grad_output, + const PackedTensorAccessor32 grad_output, PackedTensorAccessor32 grad_input, - const PackedTensorAccessor32 kernel, + const PackedTensorAccessor32 kernel, int strideT_, int strideH_, int strideW_, int paddingT, int paddingH, int paddingW, int dilationT_, int dilationH_, int dilationW_) { @@ -180,8 +180,8 @@ template __global__ void conv_depthwise3d_cuda_backward_weight_kernel( - const PackedTensorAccessor32 grad_output, - const PackedTensorAccessor32 input, + const PackedTensorAccessor32 grad_output, + const PackedTensorAccessor32 input, PackedTensorAccessor32 grad_kernel, int strideT, int strideH_, int strideW_, int paddingT, int paddingH, int paddingW, @@ -361,9 +361,9 @@ void conv_depthwise_shape_check( conv_depthwise3d_cuda_kernel \ \ <<>>( \ - input_.packed_accessor32(), \ + input_.packed_accessor32(), \ output_.packed_accessor32(), \ - weight_.packed_accessor32(), \ + weight_.packed_accessor32(), \ bias_ptr, \ stride[0], stride[1], stride[2], \ padding[0], padding[1], padding[2], \ @@ -377,9 +377,9 @@ void conv_depthwise_shape_check( conv_depthwise3d_cuda_kernel \ \ <<>>( \ - input_.packed_accessor32(), \ + input_.packed_accessor32(), \ output_.packed_accessor32(), \ - weight_.packed_accessor32(), \ + weight_.packed_accessor32(), \ bias_ptr, \ stride[0], stride[1], stride[2], \ padding[0], padding[1], padding[2], \ @@ -470,9 +470,9 @@ Tensor conv_depthwise3d_cuda( conv_depthwise3d_cuda_backward_input_kernel \ \ <<>>( \ - 
grad_output_.packed_accessor32(), \ + grad_output_.packed_accessor32(), \ grad_input_.packed_accessor32(), \ - weight_.packed_accessor32(), \ + weight_.packed_accessor32(), \ stride[0], stride[1], stride[2], \ padding[0], padding[1], padding[2], \ dilation[0], dilation[1], dilation[2]); \ @@ -485,9 +485,9 @@ Tensor conv_depthwise3d_cuda( conv_depthwise3d_cuda_backward_input_kernel \ \ <<>>( \ - grad_output_.packed_accessor32(), \ + grad_output_.packed_accessor32(), \ grad_input_.packed_accessor32(), \ - weight_.packed_accessor32(), \ + weight_.packed_accessor32(), \ stride[0], stride[1], stride[2], \ padding[0], padding[1], padding[2], \ dilation[0], dilation[1], dilation[2]); \ @@ -500,8 +500,8 @@ Tensor conv_depthwise3d_cuda( conv_depthwise3d_cuda_backward_weight_kernel \ \ <<>>( \ - grad_output_.packed_accessor32(), \ - input_.packed_accessor32(), \ + grad_output_.packed_accessor32(), \ + input_.packed_accessor32(), \ grad_weight.packed_accessor32(), \ stride[0], stride[1], stride[2], \ padding[0], padding[1], padding[2], \ @@ -515,8 +515,8 @@ Tensor conv_depthwise3d_cuda( conv_depthwise3d_cuda_backward_weight_kernel \ \ <<>>( \ - grad_output_.packed_accessor32(), \ - input_.packed_accessor32(), \ + grad_output_.packed_accessor32(), \ + input_.packed_accessor32(), \ grad_weight.packed_accessor32(), \ stride[0], stride[1], stride[2], \ padding[0], padding[1], padding[2], \ diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 04a278d83f763..8ac91f3114511 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -618,7 +618,7 @@ void bernoulli_tensor_cuda_kernel( }; // The template argument `4` below indicates that we want to operate on four // element at each time. See NOTE [ CUDA_tensor_applyN helpers ] for details. - at::cuda::CUDA_tensor_apply2(ret, p, functor); } diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index 67ea3e4f832b3..a749872ba38f3 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -45,7 +45,7 @@ template < C10_LAUNCH_BOUNDS_2(256, 4) #endif __global__ void -fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, +fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, at::cuda::detail::TensorInfo b, at::cuda::detail::TensorInfo c, IndexType totalElements, accscalar_t p, @@ -103,7 +103,7 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, // and replace IndexToOffset call with linearIndex to allow vectorization of NHWC (or other) // ordering. 
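Illustration only: the "single vectorized load" performed just below reinterprets VEC contiguous, suitably aligned scalars as one wider type so the hardware issues a single load instead of VEC scalar loads. A minimal CUDA sketch of the idea; FloatVec4 is a made-up stand-in for the aligned vector type the real kernel uses:

#include <cstdint>

// Requires base + linear_index to be 16-byte aligned and linear_index % 4 == 0.
struct alignas(4 * sizeof(float)) FloatVec4 {
  float val[4];
};

__device__ void load_vec4(const float* base, int64_t linear_index, FloatVec4* out) {
  // One 16-byte load in place of four 4-byte loads.
  *out = *reinterpret_cast<const FloatVec4*>(&base[linear_index]);
}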
// Single vectorized load - *value = *reinterpret_cast(&a.data[linearIndex]); + *value = *reinterpret_cast(&a.data[linearIndex]); scalar_t r[VEC]; mask_t mask[VEC]; @@ -133,7 +133,7 @@ template < C10_LAUNCH_BOUNDS_2(256, 4) #endif __global__ void -fused_dropout_kernel(cuda::detail::TensorInfo a, +fused_dropout_kernel(cuda::detail::TensorInfo a, cuda::detail::TensorInfo b, cuda::detail::TensorInfo c, IndexType totalElements, accscalar_t p, @@ -164,7 +164,7 @@ fused_dropout_kernel(cuda::detail::TensorInfo a, if (li < totalElements) { // Convert `linearIndex` into an offset of `a` const IndexType aOffset = - cuda::detail::IndexToOffset::get(li, a); + cuda::detail::IndexToOffset::get(li, a); src[ii] = a.data[aOffset]; } } @@ -187,8 +187,8 @@ void masked_scale_kernel(at::Tensor& ret, const at::Tensor& src, const at::Tenso auto iter = at::TensorIteratorConfig() .check_all_same_dtype(false) .add_output(ret) - .add_input(src) - .add_input(mask) + .add_const_input(src) + .add_const_input(mask) .build(); at::native::gpu_kernel( @@ -205,7 +205,7 @@ int get_vector_size(at::Tensor self, at::Tensor ret, at::Tensor mask) { if (!self.is_non_overlapping_and_dense() || !ret.is_non_overlapping_and_dense() || !mask.is_non_overlapping_and_dense()) { vec_size = 1; } else { - vec_size = memory::can_vectorize_up_to((char*)self.data_ptr()); + vec_size = memory::can_vectorize_up_to((const char*)self.const_data_ptr()); } // check that we'd have no remainders - prefer a smaller vector size with no remainders over a larger vector and remainder. @@ -236,7 +236,7 @@ inline void launcher( using accscalar_t = acc_type; accscalar_t pa = (accscalar_t)(p); auto self_info = - cuda::detail::getTensorInfo(self); + cuda::detail::getTensorInfo(self); auto ret_info = cuda::detail::getTensorInfo(ret); auto mask_info = diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index 92eb4bbbb4929..b8fb51304e4b0 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -150,7 +150,7 @@ __global__ void embedding_backward_kernel( // 5 // 8 - // Number of values proceessed by each thread (grain size) + // Number of values processed by each thread (grain size) const int SZ = 4; if (idx < numel diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 52bb16b13c5bb..64852ae79b1f9 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -538,7 +538,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cuda( auto output = at::empty({num_samples}, grad.options()); - // Early return when there is no samples in the batch. This saves unnecesary kernel + // Early return when there is no samples in the batch. 
This saves unnecessary kernel // launch, but also prevents cudaGetLastError() to complain about invalid launch args if (num_samples == 0) { return output; diff --git a/aten/src/ATen/native/cuda/FillKernel.cu b/aten/src/ATen/native/cuda/FillKernel.cu index e7e1237a6f412..dc2ecf2db35b6 100644 --- a/aten/src/ATen/native/cuda/FillKernel.cu +++ b/aten/src/ATen/native/cuda/FillKernel.cu @@ -22,7 +22,7 @@ struct FillFunctor { void fill_kernel_cuda(TensorIterator& iter, const Scalar& value) { AT_DISPATCH_V2(iter.dtype(), "fill_cuda", AT_WRAP([&]() { gpu_kernel(iter, FillFunctor(value.to())); - }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kBool, kHalf, kBFloat16, kFloat8_e4m3fn, kFloat8_e5m2, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); + }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kBool, kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } REGISTER_DISPATCH(fill_stub, &fill_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/ForeachReduceOp.cu b/aten/src/ATen/native/cuda/ForeachReduceOp.cu index d8af951afa701..eed96563efcdc 100644 --- a/aten/src/ATen/native/cuda/ForeachReduceOp.cu +++ b/aten/src/ATen/native/cuda/ForeachReduceOp.cu @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -20,16 +21,33 @@ namespace at::native { +// _foreach_norm supports only L1, L2, and inf norm +enum class NormType { L1, L2, LInf }; + +// NOTE: This is a simple variant of TensorListMetadata in MultiTensorApply.cuh +// as we only need to track addresses for the lpnorm_cleanup function below. +// Why is this struct necessary? For the same reason the TensorListMetadata +// struct is necessary--which is to ferry static metadata to the CUDA kernel +// while complying with the 4kb size constraint. Since we only need to track +// addresses, we introduce this struct to be able to fit more Tensor pointers at +// a time, currently 400 empirically, compared to the much smaller values in +// depth_to_max_tensors. This way, we can launch fewer kernels for better +// performance. +// +// IF YOU USE THIS STRUCT, PLEASE ADD A ONE-OFF TEST IN test_foreach.py AS THIS +// IS CURRENTLY ONLY TESTED FOR _foreach_norm. +const size_t MAX_TENSORS_PER_KERNEL = 400; +struct TensorListAddresses { + const void* addresses[MAX_TENSORS_PER_KERNEL]; +}; + template < typename T, - int NormType, + NormType norm_type, int depth = 1, int r_args_depth = 1, int res_arg_index = 0> struct LpNormFunctor { - static_assert( - NormType == 1 || NormType == 2, - "foreach_norm supports only L1 and L2 norm"); using opmath_t = typename at::opmath_type; __device__ __forceinline__ void operator()( int chunk_size, @@ -47,7 +65,7 @@ struct LpNormFunctor { __shared__ opmath_t s_vals[512]; opmath_t vals[kILP]; T r_x[kILP]; - for (int i = 0; i < kILP; i++) { + for (int64_t i = 0; i < kILP; i++) { vals[i] = opmath_t(0); r_x[i] = T(0); } @@ -61,7 +79,11 @@ struct LpNormFunctor { #pragma unroll for (int ii = 0; ii < kILP; ii++) { opmath_t next = static_cast(r_x[ii]); - vals[ii] += NormType == 1 ? ::abs(next) : next * next; + if constexpr (norm_type == NormType::LInf) { + vals[ii] = max_propagate_nan(vals[ii], ::abs(next)); + } else { + vals[ii] += norm_type == NormType::L1 ? ::abs(next) : next * next; + } } } } else { @@ -72,7 +94,11 @@ struct LpNormFunctor { int i = i_start + threadIdx.x + ii * blockDim.x; if (i < n && i < chunk_size) { opmath_t next = static_cast(x[i]); - vals[ii] += NormType == 1 ? 
::abs(next) : next * next; + if constexpr (norm_type == NormType::LInf) { + vals[ii] = max_propagate_nan(vals[ii], ::abs(next)); + } else { + vals[ii] += norm_type == NormType::L1 ? ::abs(next) : next * next; + } } } } @@ -80,34 +106,52 @@ struct LpNormFunctor { auto val = opmath_t(0); for (int i = 0; i < kILP; i++) { - val += vals[i]; + if constexpr (norm_type == NormType::LInf) { + val = max_propagate_nan(val, vals[i]); + } else { + val += vals[i]; + } } - auto final = at::native::cuda_utils::BlockReduceSum(val, s_vals); + auto final_val = norm_type == NormType::L1 || norm_type == NormType::L2 + ? at::native::cuda_utils::BlockReduceSum(val, s_vals) + : at::native::cuda_utils::BlockReduceMax(val, s_vals); if (threadIdx.x == 0) { output_per_tensor [(tl.start_tensor_this_launch + tensor_loc) * max_chunks_per_tensor + - chunk_idx] = final; + chunk_idx] = final_val; } } }; -template > +template < + typename T, + NormType norm_type, + typename opmath_t = at::opmath_type> __global__ void lpnorm_cleanup( const opmath_t* output_per_tensor, - T* ret_per_tensor, + TensorListAddresses addr_struct, int max_chunks_per_tensor) { __shared__ opmath_t vals[512]; const opmath_t* output_this_tensor = output_per_tensor + blockIdx.x * max_chunks_per_tensor; opmath_t val = 0; - for (int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x) { - val += output_this_tensor[i]; + for (size_t i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x) { + if constexpr (norm_type == NormType::LInf) { + val = max_propagate_nan(val, output_this_tensor[i]); + } else { + val += output_this_tensor[i]; + } } - opmath_t final = at::native::cuda_utils::BlockReduceSum(val, vals); + opmath_t final_val = norm_type == NormType::L1 || norm_type == NormType::L2 + ? at::native::cuda_utils::BlockReduceSum(val, vals) + : at::native::cuda_utils::BlockReduceMax(val, vals); if (threadIdx.x == 0) { - ret_per_tensor[blockIdx.x] = NormType == 1 ? final : ::sqrt(final); + *(T*)addr_struct.addresses[blockIdx.x] = + norm_type == NormType::L1 || norm_type == NormType::LInf + ? 
final_val + : ::sqrt(final_val); } } @@ -135,14 +179,15 @@ std::vector foreach_tensor_norm_cuda( at::isComplexType(scalar_type); }); if (!can_use_fast_route(tensors) || has_int_or_complex || - !(p == static_cast(1) || p == static_cast(2))) { + !(p == static_cast(1) || p == static_cast(2) || + p == std::numeric_limits::infinity())) { return foreach_tensor_norm_slow(tensors, ord); } - const int ntensors = tensors.size(); + const size_t ntensors = tensors.size(); int max_chunks_per_tensor = -1; - for (int t = 0; t < ntensors; t++) { + for (const auto t : c10::irange(ntensors)) { int max_chunks_this_tensor = (tensors[t].numel() + kChunkSize - 1) / kChunkSize; if (max_chunks_this_tensor > max_chunks_per_tensor) { @@ -151,9 +196,14 @@ std::vector foreach_tensor_norm_cuda( } const auto options = tensors[0].options(); auto output_per_tensor = at::zeros( - {ntensors * max_chunks_per_tensor}, + {static_cast(ntensors) * max_chunks_per_tensor}, options.dtype(toOpMathType(tensors[0].scalar_type()))); - auto ret_per_tensor = at::empty({ntensors}, options); + + std::vector vec_res; + vec_res.reserve(ntensors); + for (const auto i : c10::irange(ntensors)) { + vec_res.push_back(at::empty({}, options)); + } auto tensor_lists = std::vector>{tensors.vec()}; if (p == static_cast(1)) { @@ -166,18 +216,35 @@ std::vector foreach_tensor_norm_cuda( using opmath_t = typename at::opmath_type; multi_tensor_apply<1>( tensor_lists, - LpNormFunctor(), + LpNormFunctor(), output_per_tensor.mutable_data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); const at::cuda::OptionalCUDAGuard device_guard( device_of(output_per_tensor)); auto stream = at::cuda::getCurrentCUDAStream(); - lpnorm_cleanup<<>>( - output_per_tensor.const_data_ptr(), - ret_per_tensor.mutable_data_ptr(), - max_chunks_per_tensor); - C10_CUDA_KERNEL_LAUNCH_CHECK(); + + const size_t num_kernels = ceil_div(ntensors, MAX_TENSORS_PER_KERNEL); + for (const auto i : c10::irange(num_kernels)) { + const size_t num_tensors_this_kernel = + (i < num_kernels - 1 || ntensors % MAX_TENSORS_PER_KERNEL == 0) + ? MAX_TENSORS_PER_KERNEL + : (ntensors % MAX_TENSORS_PER_KERNEL); + + TensorListAddresses addr_struct; + for (const auto j : c10::irange(num_tensors_this_kernel)) { + addr_struct.addresses[j] = vec_res[i * MAX_TENSORS_PER_KERNEL + j] + .mutable_data_ptr(); + } + + lpnorm_cleanup + <<>>( + output_per_tensor.const_data_ptr() + + i * MAX_TENSORS_PER_KERNEL * max_chunks_per_tensor, + addr_struct, + max_chunks_per_tensor); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } }); } else if (p == static_cast(2)) { AT_DISPATCH_FLOATING_TYPES_AND2( @@ -189,18 +256,75 @@ std::vector foreach_tensor_norm_cuda( using opmath_t = typename at::opmath_type; multi_tensor_apply<1>( tensor_lists, - LpNormFunctor(), + LpNormFunctor(), output_per_tensor.mutable_data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); const at::cuda::OptionalCUDAGuard device_guard( device_of(output_per_tensor)); auto stream = at::cuda::getCurrentCUDAStream(); - lpnorm_cleanup<<>>( - output_per_tensor.const_data_ptr(), - ret_per_tensor.mutable_data_ptr(), + + const size_t num_kernels = ceil_div(ntensors, MAX_TENSORS_PER_KERNEL); + for (const auto i : c10::irange(num_kernels)) { + const size_t num_tensors_this_kernel = + (i < num_kernels - 1 || ntensors % MAX_TENSORS_PER_KERNEL == 0) + ? 
MAX_TENSORS_PER_KERNEL + : (ntensors % MAX_TENSORS_PER_KERNEL); + + TensorListAddresses addr_struct; + for (const auto j : c10::irange(num_tensors_this_kernel)) { + addr_struct.addresses[j] = vec_res[i * MAX_TENSORS_PER_KERNEL + j] + .mutable_data_ptr(); + } + + lpnorm_cleanup + <<>>( + output_per_tensor.const_data_ptr() + + i * MAX_TENSORS_PER_KERNEL * max_chunks_per_tensor, + addr_struct, + max_chunks_per_tensor); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + }); + } else if (p == std::numeric_limits::infinity()) { + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, + kBFloat16, + tensor_lists[0][0].scalar_type(), + "foreach_tensor_norm_cuda", + [&]() { + using opmath_t = typename at::opmath_type; + multi_tensor_apply<1>( + tensor_lists, + LpNormFunctor(), + output_per_tensor.mutable_data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); + const at::cuda::OptionalCUDAGuard device_guard( + device_of(output_per_tensor)); + auto stream = at::cuda::getCurrentCUDAStream(); + + const size_t num_kernels = ceil_div(ntensors, MAX_TENSORS_PER_KERNEL); + for (const auto i : c10::irange(num_kernels)) { + const size_t num_tensors_this_kernel = + (i < num_kernels - 1 || ntensors % MAX_TENSORS_PER_KERNEL == 0) + ? MAX_TENSORS_PER_KERNEL + : (ntensors % MAX_TENSORS_PER_KERNEL); + + TensorListAddresses addr_struct; + for (const auto j : c10::irange(num_tensors_this_kernel)) { + addr_struct.addresses[j] = vec_res[i * MAX_TENSORS_PER_KERNEL + j] + .mutable_data_ptr(); + } + + lpnorm_cleanup + <<>>( + output_per_tensor.const_data_ptr() + + i * MAX_TENSORS_PER_KERNEL * max_chunks_per_tensor, + addr_struct, + max_chunks_per_tensor); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } }); } else { TORCH_CHECK( @@ -216,7 +340,7 @@ std::vector foreach_tensor_norm_cuda( int i = 0; for (const auto& t : tensors) { if (t.numel() != 0) { - result.emplace_back(ret_per_tensor[i]); + result.emplace_back(vec_res[i]); i++; } else { result.emplace_back(at::zeros({}, options)); diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu index ff809d108d9ee..d7a118e6a9584 100644 --- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu +++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu @@ -388,9 +388,10 @@ void foreach_tensor_zero_cuda_(TensorList tensors) { std::vector> tensor_lists; tensor_lists.emplace_back(tensors.vec()); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( ScalarType::Half, ScalarType::BFloat16, + ScalarType::Bool, tensors[0].scalar_type(), "foreach_zero_cuda_", [&]() { diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu index 55d4b46364e75..3bc3b6f4cb510 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu @@ -47,8 +47,8 @@ template __global__ void fractional_max_pool2d_out_cuda_frame( PackedTensorAccessor output, PackedTensorAccessor indices, - PackedTensorAccessor input, - PackedTensorAccessor samples, + PackedTensorAccessor input, + PackedTensorAccessor samples, int poolSizeH, int poolSizeW) { using accscalar_t = at::acc_type; @@ -103,8 +103,8 @@ __global__ void fractional_max_pool2d_out_cuda_frame( template __global__ void fractional_max_pool2d_backward_out_cuda_frame( PackedTensorAccessor gradInput, - PackedTensorAccessor gradOutput, - PackedTensorAccessor indices) { + PackedTensorAccessor gradOutput, + PackedTensorAccessor indices) { // Output (h, w) point that this thread is responsible for int ourOutputPoint = 
threadIdx.x + blockIdx.x * blockDim.x; int plane = blockIdx.y; @@ -186,10 +186,10 @@ TORCH_IMPL_FUNC(fractional_max_pool2d_out_cuda) ( input.scalar_type(), "fractional_max_pool2d_out_cuda_frame", [&] { - auto devInput = input_.packed_accessor64(); + auto devInput = input_.packed_accessor64(); auto devOutput = output_.packed_accessor64(); auto devIndices = indices_.packed_accessor64(); - auto devSamples = randomSamples.packed_accessor64(); + auto devSamples = randomSamples.packed_accessor64(); fractional_max_pool2d_out_cuda_frame <<>>( devOutput, devIndices, devInput, devSamples, @@ -254,7 +254,7 @@ TORCH_IMPL_FUNC(fractional_max_pool2d_backward_cuda)( gradInput_.size(0)); dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize); - auto devIndices = indices_.packed_accessor64(); + auto devIndices = indices_.packed_accessor64(); AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, @@ -262,7 +262,7 @@ TORCH_IMPL_FUNC(fractional_max_pool2d_backward_cuda)( "fractional_max_pool2d_backward_out_cuda_frame", [&] { auto devGradInput = gradInput_.packed_accessor64(); - auto devGradOutput = gradOutput_.packed_accessor64(); + auto devGradOutput = gradOutput_.packed_accessor64(); fractional_max_pool2d_backward_out_cuda_frame <<>>( devGradInput, devGradOutput, devIndices); diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu index 9873b4da5998a..0bd2f50e12bb7 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu @@ -53,10 +53,10 @@ __device__ inline int64_t get_intervals( template __global__ void fractional_max_pool3d_out_frame( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, PackedTensorAccessor64 indices, - PackedTensorAccessor64 samples, + PackedTensorAccessor64 samples, int64_t poolSizeT, int64_t poolSizeH, int64_t poolSizeW) { using accscalar_t = at::acc_type; // Output (t, h, w) point that this thread is responsible for @@ -120,8 +120,8 @@ __global__ void fractional_max_pool3d_out_frame( template __global__ void fractional_max_pool3d_backward_out_frame( PackedTensorAccessor64 gradInput, - PackedTensorAccessor64 gradOutput, - PackedTensorAccessor64 indices) { + PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 indices) { // Output (h, w) point that this thread is responsible for int64_t ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x; int64_t plane = blockIdx.y; @@ -235,8 +235,8 @@ void fractional_max_pool3d_backward_out_cuda_template( fractional_max_pool3d_backward_out_frame <<>>( gradInput_.packed_accessor64(), - gradOutput_.packed_accessor64(), - indices_.packed_accessor64() + gradOutput_.packed_accessor64(), + indices_.packed_accessor64() ); C10_CUDA_KERNEL_LAUNCH_CHECK(); } @@ -295,10 +295,10 @@ TORCH_IMPL_FUNC(fractional_max_pool3d_out_cuda) ( [&]{ fractional_max_pool3d_out_frame <<>>( - input_.packed_accessor64(), + input_.packed_accessor64(), output_.packed_accessor64(), indices_.packed_accessor64(), - randomSamples.packed_accessor64(), + randomSamples.packed_accessor64(), poolSizeT, poolSizeH, poolSizeW ); C10_CUDA_KERNEL_LAUNCH_CHECK(); diff --git a/aten/src/ATen/native/cuda/FusedSgdKernel.cu b/aten/src/ATen/native/cuda/FusedSgdKernel.cu new file mode 100644 index 0000000000000..36ac7401a2d0b --- /dev/null +++ b/aten/src/ATen/native/cuda/FusedSgdKernel.cu @@ -0,0 +1,427 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + 
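For readers of the fused kernel below, a plain scalar restatement of the per-element update that sgd_math applies (illustration only; gradient unscaling is omitted and sgd_step_scalar is a hypothetical name):

// Reference-only scalar form of the fused SGD step for one parameter element.
inline void sgd_step_scalar(
    double& p, double g, double& momentum_buffer, bool has_momentum_buffer,
    bool is_first_step, double lr, double weight_decay, double momentum,
    double dampening, bool nesterov, bool maximize) {
  if (maximize) {
    g = -g;
  }
  if (weight_decay != 0) {
    g += weight_decay * p;
  }
  if (has_momentum_buffer) {
    momentum_buffer = is_first_step
        ? g
        : momentum * momentum_buffer + (1 - dampening) * g;
    g = nesterov ? g + momentum * momentum_buffer : momentum_buffer;
  }
  p -= lr * g;
}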
+namespace { + +template +C10_DEVICE __forceinline__ void sgd_math( + scalar_t r_args[depth][kILP], + const double weight_decay, + const double momentum, + const float* lr_ptr, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr) { + using opmath_t = at::opmath_type; + const double double_lr = lr_ptr != nullptr ? *lr_ptr : lr; +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + auto p = static_cast(r_args[0][ii]); + auto g = static_cast(r_args[1][ii]); + if (grad_scale_ptr) { + g /= static_cast(*grad_scale_ptr); + r_args[1][ii] = g; + } + if (maximize) { + g *= -1.0; + } + if (weight_decay != 0) { + g += weight_decay * p; + } + if (depth > 2) { + const auto momentum_buffer = is_first_step + ? g + : (momentum * static_cast(r_args[2][ii]) + + (1 - dampening) * g); + r_args[2][ii] = momentum_buffer; + + if (nesterov) { + g = g + momentum * momentum_buffer; + } else { + g = momentum_buffer; + } + } + p -= double_lr * g; + r_args[0][ii] = p; + } +} + +template +struct FusedSgdMathFunctor { + static_assert( + depth == 2 || depth == 3, + "depth of 2 for SGD w/ momentum == 0, 3 for SGD w/ momentum != 0"); + C10_DEVICE __forceinline__ void operator()( + const int chunk_size, + TensorListMetadata& tl, + const double weight_decay, + const double momentum, + const float* lr_ptr, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr, + const float* found_inf_ptr) { + if (found_inf_ptr && *found_inf_ptr == 1) { + return; + } + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + + scalar_t* args[depth]; + scalar_t r_args[depth][kILP]; + const auto all_aligned{ + init_args(args, tl, chunk_idx, chunk_size, tensor_loc)}; + const auto n = tl.numel_for_tensor[tensor_loc] - chunk_idx * chunk_size; + +#ifndef USE_ROCM + const auto use_faster_load_store = + (n % kILP == 0) && (chunk_size % kILP == 0) && all_aligned; +#else + const auto use_faster_load_store{false}; +#endif + if (use_faster_load_store) { + for (auto i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { +#pragma unroll + for (auto i = 0; i < depth; i++) { + load_store(r_args[i], args[i], 0, i_start); + } + sgd_math( + r_args, + weight_decay, + momentum, + lr_ptr, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr); + load_store(args[0], r_args[0], i_start, 0); + if (grad_scale_ptr) { + load_store(args[1], r_args[1], i_start, 0); + } + if (depth > 2) { + load_store(args[2], r_args[2], i_start, 0); + } + } + } else { + for (auto i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); + sgd_math( + r_args, + weight_decay, + momentum, + lr_ptr, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr); + store_args(args[0], r_args[0], i_start, chunk_size, n); + if (grad_scale_ptr) { + store_args(args[1], r_args[1], i_start, chunk_size, n); + } + if (depth > 2) { + store_args(args[2], r_args[2], i_start, chunk_size, n); + } + } + } + } +}; + +void _fused_sgd_with_momentum_kernel_cuda_( + at::TensorList params, + at::TensorList grads, + at::TensorList momentum_buffer_list, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool 
is_first_step, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + TORCH_CHECK_GT(momentum, 0); + TORCH_CHECK(at::native::check_fast_path_restrictions( + {params, grads, momentum_buffer_list})); + float* grad_scale_ptr = + grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; + float* found_inf_ptr = + found_inf.has_value() ? found_inf->data_ptr() : nullptr; + float* lr_ptr = nullptr; + + std::vector> tensor_lists{ + params.vec(), grads.vec(), momentum_buffer_list.vec()}; + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, + kBFloat16, + params[0].scalar_type(), + "fused_sgd_with_momentum_kernel_cuda", + [&]() { + multi_tensor_apply<3>( + tensor_lists, + FusedSgdMathFunctor(), + weight_decay, + momentum, + lr_ptr, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr, + found_inf_ptr); + }); +} + +void _fused_sgd_with_momentum_kernel_cuda_( + at::TensorList params, + at::TensorList grads, + at::TensorList momentum_buffer_list, + const double weight_decay, + const double momentum, + const at::Tensor& lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + if (lr.is_cpu()) { + _fused_sgd_with_momentum_kernel_cuda_( + params, + grads, + momentum_buffer_list, + weight_decay, + momentum, + lr.item(), + dampening, + nesterov, + maximize, + is_first_step, + grad_scale, + found_inf); + return; + } + TORCH_CHECK_GT(momentum, 0); + TORCH_CHECK(at::native::check_fast_path_restrictions( + {params, grads, momentum_buffer_list})); + if (grad_scale != c10::nullopt) { + TORCH_CHECK( + grad_scale->device() == params[0].device(), + "grad_scale must be on the same GPU device as the params"); + } + if (found_inf != c10::nullopt) { + TORCH_CHECK( + found_inf->device() == params[0].device(), + "found_inf must be on the same GPU device as the params"); + } + TORCH_CHECK( + lr.device() == params[0].device(), + "found_inf must be on the same GPU device as the params"); + float* grad_scale_ptr = + grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; + float* found_inf_ptr = + found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; + + std::vector> tensor_lists{ + params.vec(), grads.vec(), momentum_buffer_list.vec()}; + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, + kBFloat16, + params[0].scalar_type(), + "fused_sgd_with_momentum_kernel_cuda", + [&]() { + multi_tensor_apply<3>( + tensor_lists, + FusedSgdMathFunctor(), + weight_decay, + momentum, + lr.data_ptr(), + 1.0, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr, + found_inf_ptr); + }); +} + +} // namespace + +void _fused_sgd_kernel_cuda_( + at::TensorList params, + at::TensorList grads, + at::TensorList momentum_buffer_list, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + if (!momentum_buffer_list.empty()) { + _fused_sgd_with_momentum_kernel_cuda_( + params, + grads, + momentum_buffer_list, + weight_decay, + momentum, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale, + found_inf); + return; + } + TORCH_CHECK_EQ(momentum, 0); + TORCH_CHECK(at::native::check_fast_path_restrictions({params, grads})); + if (is_first_step) { + TORCH_WARN_ONCE( + "`is_first_step` argument has no effect when `momentum_buffer_list` is empty"); + } + float* grad_scale_ptr = + grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; + float* found_inf_ptr = + found_inf.has_value() ? found_inf->data_ptr() : nullptr; + float* lr_ptr = nullptr; + + std::vector> tensor_lists{params.vec(), grads.vec()}; + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, + kBFloat16, + params[0].scalar_type(), + "fused_sgd_kernel_cuda", + [&]() { + multi_tensor_apply<2>( + tensor_lists, + FusedSgdMathFunctor(), + weight_decay, + momentum, + lr_ptr, + lr, + dampening, + nesterov, + maximize, + /* is_first_step */ false, + grad_scale_ptr, + found_inf_ptr); + }); +} + +void _fused_sgd_kernel_cuda_( + at::TensorList params, + at::TensorList grads, + at::TensorList momentum_buffer_list, + const double weight_decay, + const double momentum, + const at::Tensor& lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + if (!momentum_buffer_list.empty()) { + _fused_sgd_with_momentum_kernel_cuda_( + params, + grads, + momentum_buffer_list, + weight_decay, + momentum, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale, + found_inf); + return; + } + if (lr.is_cpu()) { + _fused_sgd_kernel_cuda_( + params, + grads, + momentum_buffer_list, + weight_decay, + momentum, + lr.item(), + dampening, + nesterov, + maximize, + is_first_step, + grad_scale, + found_inf); + return; + } + TORCH_CHECK_EQ(momentum, 0); + TORCH_CHECK(at::native::check_fast_path_restrictions({params, grads})); + if (is_first_step) { + TORCH_WARN_ONCE( + "`is_first_step` argument has no effect when `momentum_buffer_list` is empty"); + } + if (grad_scale.has_value()) { + TORCH_CHECK( + grad_scale->device() == params[0].device(), + "grad_scale must be on the same GPU device as the params"); + } + if (found_inf.has_value()) { + TORCH_CHECK( + found_inf->device() == params[0].device(), + "found_inf must be on the same GPU device as the params"); + } + TORCH_CHECK( + lr.device() == params[0].device(), + "found_inf must be on the same GPU device as the params"); + float* grad_scale_ptr = + grad_scale.has_value() ? 
grad_scale->data_ptr() : nullptr; + float* found_inf_ptr = + found_inf.has_value() ? found_inf->data_ptr() : nullptr; + + std::vector> tensor_lists{params.vec(), grads.vec()}; + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, + kBFloat16, + params[0].scalar_type(), + "fused_sgd_kernel_cuda", + [&]() { + multi_tensor_apply<2>( + tensor_lists, + FusedSgdMathFunctor(), + weight_decay, + momentum, + lr.data_ptr(), + 1.0, + dampening, + nesterov, + maximize, + /* is_first_step */ false, + grad_scale_ptr, + found_inf_ptr); + }); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/cuda/GridSampler.cu b/aten/src/ATen/native/cuda/GridSampler.cu index 9d87cbc327114..2c9128eee2217 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cu +++ b/aten/src/ATen/native/cuda/GridSampler.cu @@ -25,8 +25,8 @@ namespace { C10_LAUNCH_BOUNDS_1(256) __global__ void grid_sampler_2d_kernel( const index_t nthreads, - TensorInfo input, - TensorInfo grid, + TensorInfo input, + TensorInfo grid, TensorInfo output, const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, @@ -104,7 +104,7 @@ namespace { index_t ix_nearest = static_cast(std::nearbyint(ix)); index_t iy_nearest = static_cast(std::nearbyint(iy)); - // assign nearest neighor pixel value to output pixel + // assign nearest neighbour pixel value to output pixel auto inp_ptr_NC = input.data + n * inp_sN; auto out_ptr_NCHW = output.data + n * out_sN + h * out_sH + w * out_sW; for (index_t c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) { @@ -155,8 +155,8 @@ namespace { C10_LAUNCH_BOUNDS_1(512) __global__ void grid_sampler_3d_kernel( const index_t nthreads, - TensorInfo input, - TensorInfo grid, + TensorInfo input, + TensorInfo grid, TensorInfo output, const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, @@ -287,7 +287,7 @@ namespace { index_t iy_nearest = static_cast(std::nearbyint(iy)); index_t iz_nearest = static_cast(std::nearbyint(iz)); - // assign nearest neighor pixel value to output pixel + // assign nearest neighbour pixel value to output pixel auto inp_ptr_NC = input.data + n * inp_sN; auto out_ptr_NCDHW = output.data + n * out_sN + d * out_sD + h * out_sH + w * out_sW; for (index_t c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { @@ -311,9 +311,9 @@ namespace { C10_LAUNCH_BOUNDS_1(256) __global__ void grid_sampler_2d_backward_kernel( const index_t nthreads, - TensorInfo grad_output, - TensorInfo input, - TensorInfo grid, + TensorInfo grad_output, + TensorInfo input, + TensorInfo grid, TensorInfo grad_input, // initialized to zeros (or unused if input_requires_grad is false) TensorInfo grad_grid, // initialized to empty const GridSamplerInterpolation interpolation_mode, @@ -385,11 +385,11 @@ namespace { scalar_t se = (ix - ix_nw) * (iy - iy_nw); scalar_t gix = static_cast(0), giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; + const scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; index_t NC_offset = n * gInp_sN; - scalar_t *inp_ptr_NC = input.data + n * inp_sN; + const scalar_t *inp_ptr_NC = input.data + n * inp_sN; for (index_t c = 0; c < C; ++c, inp_ptr_NC += inp_sC, NC_offset += gInp_sC, gOut_ptr_NCHW += gOut_sC) { - scalar_t gOut = *gOut_ptr_NCHW; + const scalar_t gOut = *gOut_ptr_NCHW; if (input_requires_grad) { // calculate and set grad_input. See Note [Passing pointer and offset to fastAtomicAdd]. 
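For context, the nw/ne/sw/se factors used in this backward pass are the standard bilinear weights of the sampling point relative to its four neighbouring pixels. A minimal host-side sketch for reference only (bilinear_weights is a hypothetical helper, not part of this file):

struct BilinearWeights {
  float nw, ne, sw, se;
};

// (ix, iy) is the continuous sampling location, (ix_nw, iy_nw) its north-west corner.
inline BilinearWeights bilinear_weights(float ix, float iy, int ix_nw, int iy_nw) {
  const float tx = ix - ix_nw;  // fractional x offset in [0, 1)
  const float ty = iy - iy_nw;  // fractional y offset in [0, 1)
  return {(1.f - tx) * (1.f - ty),  // nw
          tx * (1.f - ty),          // ne
          (1.f - tx) * ty,          // sw
          tx * ty};                 // se
}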
@@ -434,8 +434,8 @@ namespace { index_t ix_nearest = static_cast(std::nearbyint(ix)); index_t iy_nearest = static_cast(std::nearbyint(iy)); - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; + // assign nearest neighbour pixel value to output pixel + const scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; index_t NC_offset = n * gInp_sN; for (index_t c = 0; c < C; ++c, NC_offset += gInp_sC, gOut_ptr_NCHW += gOut_sC) { // calculate and set grad_input. See Note [Passing pointer and offset to fastAtomicAdd]. @@ -474,12 +474,12 @@ namespace { scalar_t gix = static_cast(0); scalar_t giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; + const scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; index_t NC_offset = n * gInp_sN; - scalar_t *inp_ptr_NC = input.data + n * inp_sN; + const scalar_t *inp_ptr_NC = input.data + n * inp_sN; for (index_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, NC_offset += gInp_sC, inp_ptr_NC+= inp_sC) { - scalar_t gOut = *gOut_ptr_NCHW; + const scalar_t gOut = *gOut_ptr_NCHW; #pragma unroll 4 for (index_t i = 0; i < 4; ++i) { @@ -517,9 +517,9 @@ namespace { C10_LAUNCH_BOUNDS_1(256) __global__ void grid_sampler_3d_backward_kernel( const index_t nthreads, - TensorInfo grad_output, - TensorInfo input, - TensorInfo grid, + TensorInfo grad_output, + TensorInfo input, + TensorInfo grid, TensorInfo grad_input, // initialized to zeros (or unused if input_requires_grad is false) TensorInfo grad_grid, // initialized to empty const GridSamplerInterpolation interpolation_mode, @@ -630,12 +630,12 @@ namespace { scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); - scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + const scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; index_t NC_offset; if (input_requires_grad) { NC_offset = n * gInp_sN; } - scalar_t *inp_ptr_NC = input.data + n * inp_sN; + const scalar_t *inp_ptr_NC = input.data + n * inp_sN; // calculate bilinear weighted pixel value and set output pixel for (index_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, NC_offset += gInp_sC, inp_ptr_NC += inp_sC) { scalar_t gOut = *gOut_ptr_NCDHW; @@ -724,8 +724,8 @@ namespace { auto iy_nearest = static_cast(std::nearbyint(iy)); auto iz_nearest = static_cast(std::nearbyint(iz)); - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + // assign nearest neighbour pixel value to output pixel + const scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; index_t NC_offset = n * gInp_sN; for (index_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, NC_offset += gInp_sC) { // calculate and set grad_input. See Note [Passing pointer and offset to fastAtomicAdd]. 
@@ -768,8 +768,8 @@ void launch_grid_sampler_2d_forward_kernel( grid_sampler_2d_kernel <<>>( static_cast(count), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(input), + getTensorInfo(grid), getTensorInfo(output), static_cast(interpolation_mode), static_cast(padding_mode), @@ -779,8 +779,8 @@ void launch_grid_sampler_2d_forward_kernel( grid_sampler_2d_kernel <<>>( count, - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(input), + getTensorInfo(grid), getTensorInfo(output), static_cast(interpolation_mode), static_cast(padding_mode), @@ -813,8 +813,8 @@ void launch_grid_sampler_3d_forward_kernel( grid_sampler_3d_kernel <<>>( static_cast(count), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(input), + getTensorInfo(grid), getTensorInfo(output), static_cast(interpolation_mode), static_cast(padding_mode), @@ -824,8 +824,8 @@ void launch_grid_sampler_3d_forward_kernel( grid_sampler_3d_kernel <<>>( count, - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(input), + getTensorInfo(grid), getTensorInfo(output), static_cast(interpolation_mode), static_cast(padding_mode), @@ -868,9 +868,9 @@ void launch_grid_sampler_2d_backward_kernel( grid_sampler_2d_backward_kernel <<>>( static_cast(count), - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), input_requires_grad ? getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), static_cast(interpolation_mode), @@ -883,9 +883,9 @@ void launch_grid_sampler_2d_backward_kernel( grid_sampler_2d_backward_kernel <<>>( count, - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), input_requires_grad ? getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), static_cast(interpolation_mode), @@ -927,9 +927,9 @@ void launch_grid_sampler_3d_backward_kernel( grid_sampler_3d_backward_kernel <<>>( static_cast(count), - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), input_requires_grad ? getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), static_cast(interpolation_mode), @@ -942,9 +942,9 @@ void launch_grid_sampler_3d_backward_kernel( grid_sampler_3d_backward_kernel <<>>( count, - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), input_requires_grad ? 
getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), static_cast(interpolation_mode), diff --git a/aten/src/ATen/native/cuda/GridSampler.cuh b/aten/src/ATen/native/cuda/GridSampler.cuh index a0e3b16c3a43a..731f4d7824bf1 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cuh +++ b/aten/src/ATen/native/cuda/GridSampler.cuh @@ -228,7 +228,7 @@ bool within_bounds_3d(int d, int h, int w, int D, int H, int W) { template static __forceinline__ __device__ scalar_t get_value_bounded( - scalar_t *data, scalar_t x, scalar_t y, int W, int H, int sW, int sH, + const scalar_t *data, scalar_t x, scalar_t y, int W, int H, int sW, int sH, GridSamplerPadding padding_mode, bool align_corners) { diff --git a/aten/src/ATen/native/cuda/IGammaKernel.cu b/aten/src/ATen/native/cuda/IGammaKernel.cu index be3f7fc54a6b3..7102110fb4fd3 100644 --- a/aten/src/ATen/native/cuda/IGammaKernel.cu +++ b/aten/src/ATen/native/cuda/IGammaKernel.cu @@ -450,7 +450,7 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) { } // NOTE: this __noinline__ is important -- otherwise, observed compile times significantly -// increase. The same kernel seems to get recompiled mulitple times via gpu_kernel_with_scalars, +// increase. The same kernel seems to get recompiled multiple times via gpu_kernel_with_scalars, // multiple dtypes, etc. template __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) { diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index 657c0c77b3d67..5682ba2757315 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -333,7 +333,7 @@ void take_kernel( // Cannot use `OpaqueType`, as Tensor::data_ptr> is not implemented AT_DISPATCH_INDEX_TYPES(cuda::detail::canUse32BitIndexMath(input) ? 
ScalarType::Int : ScalarType::Long, "take_cuda_index", [&] { - const auto* __restrict__ indexed_ptr = input.template data_ptr(); + const auto* __restrict__ indexed_ptr = input.template const_data_ptr(); cuda_take_put_kernel(iter, input, [indexed_ptr] __device__(scalar_t& iterated, const index_t offset) { iterated = indexed_ptr[offset]; @@ -385,7 +385,7 @@ void launch_masked_scatter_kernel( .resize_outputs(false) .add_output(self) .add_input(self) - .add_input(mask_cont) + .add_const_input(mask_cont) .add_input(maskPrefixSum) .build(); diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 607cb9e2c9c56..c3eadde686355 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -414,9 +414,7 @@ static std::tuple __global__ void indexFuncSmallIndex(cuda::detail::TensorInfo dst, - cuda::detail::TensorInfo src, - cuda::detail::TensorInfo indices, + cuda::detail::TensorInfo src, + cuda::detail::TensorInfo indices, int dstAddDim, int srcAddDim, IndexType innerSize, @@ -744,7 +742,7 @@ __global__ void indexFuncSmallIndex(cuda::detail::TensorInfo dst, for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) { // Lua indices begin at 1 IndexType dstIndex = - indices.data[cuda::detail::IndexToOffset::get(srcIndex, indices)]; + indices.data[cuda::detail::IndexToOffset::get(srcIndex, indices)]; CUDA_KERNEL_ASSERT(dstIndex < dstAddDimSize); // We stride over the output ignoring the indexed dimension @@ -757,7 +755,7 @@ __global__ void indexFuncSmallIndex(cuda::detail::TensorInfo dst, dstOffset += dstIndex * dst.strides[dstAddDim]; IndexType srcOffset = - cuda::detail::IndexToOffset::get(linearIndex, src); + cuda::detail::IndexToOffset::get(linearIndex, src); srcOffset += srcIndex * src.strides[srcAddDim]; T val = src.data[srcOffset] * alpha; @@ -776,8 +774,8 @@ __global__ void indexFuncSmallIndex(cuda::detail::TensorInfo dst, template __global__ void indexFuncLargeIndex(cuda::detail::TensorInfo dst, - cuda::detail::TensorInfo src, - cuda::detail::TensorInfo indices, + cuda::detail::TensorInfo src, + cuda::detail::TensorInfo indices, int dstAddDim, int srcAddDim, IndexType totalSize, @@ -803,7 +801,7 @@ __global__ void indexFuncLargeIndex(cuda::detail::TensorInfo dst, // Lua indices begin at 1 IndexType dstIndex = - indices.data[cuda::detail::IndexToOffset::get(srcIndex, indices)]; + indices.data[cuda::detail::IndexToOffset::get(srcIndex, indices)]; CUDA_KERNEL_ASSERT(dstIndex < dstAddDimSize); IndexType dstOffset = @@ -811,7 +809,7 @@ __global__ void indexFuncLargeIndex(cuda::detail::TensorInfo dst, dstOffset += dstIndex * dst.strides[dstAddDim]; IndexType srcOffset = - cuda::detail::IndexToOffset::get(elementInSlice, src); + cuda::detail::IndexToOffset::get(elementInSlice, src); srcOffset += srcIndex * src.strides[srcAddDim]; T val = src.data[srcOffset] * alpha; @@ -933,12 +931,12 @@ void index_add_cuda_impl(const Tensor& self, int64_t dim, const Tensor& index, c const auto alpha_value = alpha.to(); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_cuda_", [&] () { auto sourceInfo = - cuda::detail::getTensorInfo(source_); + cuda::detail::getTensorInfo(source_); const int sourceAddDim = sourceInfo.collapseDims(dim); sourceInfo.reduceDim(sourceAddDim); auto indexInfo = - cuda::detail::getTensorInfo(index); + cuda::detail::getTensorInfo(index); indexInfo.collapseDims(); // A reasonable choice for when to have each thread iterate over @@ -984,14 +982,14 @@ void index_add_cuda_impl(const Tensor& self, int64_t 
dim, const Tensor& index, c selfInfo.reduceDim(selfAddDim); const auto alpha_value = alpha.to(); - cuda::detail::TensorInfo sourceInfo = - cuda::detail::getTensorInfo(source_); + cuda::detail::TensorInfo sourceInfo = + cuda::detail::getTensorInfo(source_); const int sourceAddDim = sourceInfo.collapseDims(dim); sourceInfo.reduceDim(sourceAddDim); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_cuda_", [&] () { - cuda::detail::TensorInfo indexInfo = - cuda::detail::getTensorInfo(index); + cuda::detail::TensorInfo indexInfo = + cuda::detail::getTensorInfo(index); indexInfo.collapseDims(); LARGE_INDEX(scalar_t, index_t, uint64_t, -1, -1, -1, true); @@ -1106,12 +1104,12 @@ void index_reduce_func_cuda_impl( auto alpha_value = (scalar_t) 1; AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_reduce_cuda", [&] () { auto sourceInfo = - cuda::detail::getTensorInfo(source_); + cuda::detail::getTensorInfo(source_); int sourceReduceDim = sourceInfo.collapseDims(dim); sourceInfo.reduceDim(sourceReduceDim); auto indexInfo = - cuda::detail::getTensorInfo(index); + cuda::detail::getTensorInfo(index); indexInfo.collapseDims(); // A reasonable choice for when to have each thread iterate over @@ -1157,14 +1155,14 @@ void index_reduce_func_cuda_impl( selfInfo.reduceDim(selfReduceDim); auto alpha_value = (scalar_t) 1; - cuda::detail::TensorInfo sourceInfo = - cuda::detail::getTensorInfo(source_); + cuda::detail::TensorInfo sourceInfo = + cuda::detail::getTensorInfo(source_); int sourceReduceDim = sourceInfo.collapseDims(dim); sourceInfo.reduceDim(sourceReduceDim); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_reduce_cuda", [&] () { - cuda::detail::TensorInfo indexInfo = - cuda::detail::getTensorInfo(index); + cuda::detail::TensorInfo indexInfo = + cuda::detail::getTensorInfo(index); indexInfo.collapseDims(); LARGE_INDEX(scalar_t, index_t, uint64_t, -1, -1, -1, true); @@ -1221,8 +1219,8 @@ namespace { // parallelism. template __global__ void indexSelectSmallIndex(cuda::detail::TensorInfo dst, - cuda::detail::TensorInfo src, - cuda::detail::TensorInfo indices, + cuda::detail::TensorInfo src, + cuda::detail::TensorInfo indices, int dstSelectDim, int srcSelectDim, IndexType innerSize, @@ -1234,7 +1232,7 @@ __global__ void indexSelectSmallIndex(cuda::detail::TensorInfo dst // re-accessing indices in addition to src elements can be slow. 
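As a plain-CPU reference for what the indexSelect kernels compute, for a contiguous 2-D input and dim == 0 the result is dst[i][j] = src[indices[i]][j]; the CUDA kernels generalize this to arbitrary strides and swap the slice/index loops for memory locality. A minimal sketch (illustration only, index_select_dim0 is a hypothetical helper):

#include <cstdint>
#include <vector>

std::vector<float> index_select_dim0(
    const std::vector<float>& src,       // row-major, shape [rows, cols]
    int64_t cols,
    const std::vector<int64_t>& indices) {
  std::vector<float> dst(indices.size() * cols);
  for (size_t i = 0; i < indices.size(); ++i) {
    for (int64_t j = 0; j < cols; ++j) {
      dst[i * cols + j] = src[indices[i] * cols + j];
    }
  }
  return dst;
}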
for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) { IndexType srcIndex = - indices.data[cuda::detail::IndexToOffset::get(dstIndex, indices)]; + indices.data[cuda::detail::IndexToOffset::get(dstIndex, indices)]; CUDA_KERNEL_ASSERT(srcIndex < srcSelectDimSize); // We stride over the output ignoring the indexed dimension @@ -1247,7 +1245,7 @@ __global__ void indexSelectSmallIndex(cuda::detail::TensorInfo dst dstOffset += dstIndex * dst.strides[dstSelectDim]; IndexType srcOffset = - cuda::detail::IndexToOffset::get(linearIndex, src); + cuda::detail::IndexToOffset::get(linearIndex, src); srcOffset += srcIndex * src.strides[srcSelectDim]; dst.data[dstOffset] = src.data[srcOffset]; @@ -1264,8 +1262,8 @@ __global__ void indexSelectSmallIndex(cuda::detail::TensorInfo dst template __global__ void indexSelectLargeIndex(cuda::detail::TensorInfo dst, - cuda::detail::TensorInfo src, - cuda::detail::TensorInfo indices, + cuda::detail::TensorInfo src, + cuda::detail::TensorInfo indices, int dstSelectDim, int srcSelectDim, IndexType totalSize, @@ -1287,7 +1285,7 @@ __global__ void indexSelectLargeIndex(cuda::detail::TensorInfo dst } IndexType srcIndex = - indices.data[cuda::detail::IndexToOffset::get(dstIndex, indices)]; + indices.data[cuda::detail::IndexToOffset::get(dstIndex, indices)]; CUDA_KERNEL_ASSERT(srcIndex < srcSelectDimSize); IndexType dstOffset = @@ -1295,7 +1293,7 @@ __global__ void indexSelectLargeIndex(cuda::detail::TensorInfo dst dstOffset += dstIndex * dst.strides[dstSelectDim]; IndexType srcOffset = - cuda::detail::IndexToOffset::get(elementInSlice, src); + cuda::detail::IndexToOffset::get(elementInSlice, src); srcOffset += srcIndex * src.strides[srcSelectDim]; dst.data[dstOffset] = src.data[srcOffset]; @@ -1395,12 +1393,12 @@ void index_select_out_cuda_impl( int outSelectDim = outInfo.collapseDims(dim); outInfo.reduceDim(outSelectDim); - auto selfInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(self)); + auto selfInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(self)); int selfSelectDim = selfInfo.collapseDims(dim); selfInfo.reduceDim(selfSelectDim); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_select_out_cuda_impl", [&] () { - auto indicesInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(index)); + auto indicesInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(index)); indicesInfo.collapseDims(); // A reasonable choice for when to have each thread iterate over @@ -1442,11 +1440,11 @@ void index_select_out_cuda_impl( int outSelectDim = outInfo.collapseDims(dim); outInfo.reduceDim(outSelectDim); - auto selfInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(self)); + auto selfInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(self)); int selfSelectDim = selfInfo.collapseDims(dim); selfInfo.reduceDim(selfSelectDim); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_select_out_cuda_impl", [&] () { - auto indicesInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(index)); + auto indicesInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(index)); indicesInfo.collapseDims(); LARGE_INDEX(scalar_t, index_t, uint64_t, -1, -1, -1, true); @@ -1576,8 +1574,8 @@ Tensor & masked_fill__cuda(Tensor& self, const Tensor & mask, const Scalar& valu .check_all_same_dtype(false) .resize_outputs(false) .add_output(self) - .add_input(self) - .add_input(*b_mask) + .add_const_input(self) + .add_const_input(*b_mask) .build(); masked_fill_kernel(iter, value); diff --git a/aten/src/ATen/native/cuda/Loops.cuh 
b/aten/src/ATen/native/cuda/Loops.cuh index fe38b1e17f24e..cb14f275e2171 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -68,17 +68,7 @@ __device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) { }} // namespace at::native -// Note: -// CUDA and ROCm get diverged in this PR: -// https://github.com/pytorch/pytorch/pull/32383 -// Because for some reason trying to enable vectorized -// memory access introduce regression on ROCm. - -#if !defined(USE_ROCM) - #include -#else - #include -#endif +#include namespace at:: native { diff --git a/aten/src/ATen/native/cuda/Loss.cu b/aten/src/ATen/native/cuda/Loss.cu index 3f76f0931bfbc..1691adca87253 100644 --- a/aten/src/ATen/native/cuda/Loss.cu +++ b/aten/src/ATen/native/cuda/Loss.cu @@ -219,7 +219,7 @@ __global__ void nll_loss_forward_reduce_cuda_kernel_1d( *output = -cur_weight * input[t]; } } else { - // If the only element was omited, we get 0. See the discussion in + // If the only element was omitted, we get 0. See the discussion in // https://github.com/pytorch/pytorch/pull/64572#issuecomment-926504162 *output = scalar_t{0}; *total_weight = scalar_t{0}; @@ -408,7 +408,7 @@ template __global__ void nll_loss_backward_no_reduce_cuda_kernel( int batch_size, const index_t *target, - PackedTensorAccessor64 grad_output, + PackedTensorAccessor64 grad_output, PackedTensorAccessor64 grad_input, const scalar_t *weights, int64_t n_classes, @@ -520,7 +520,7 @@ void nll_loss_backward_out_cuda_template( at::cuda::getCurrentCUDAStream()>>>( batch_size, target.const_data_ptr(), - grad_output.packed_accessor64(), + grad_output.packed_accessor64(), grad_input.packed_accessor64(), weight.defined() ? weight_.const_data_ptr() : nullptr, n_classes, diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index 5fb86d16e95a9..b451592f19440 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -44,7 +44,7 @@ namespace { // so if l is l_0 l_1 ... l_(tl-1) then this looks up idx in // l' = BLANK l_0 BLANK l_1 BLANK ... BLANK l_(tl-1) BLANK // - note that no bound-checking is done -// - it is important to only call it witth idx == 0 if the target length is 0 +// - it is important to only call it with idx == 0 if the target length is 0 // - __restrict__ impact to be measured, see // https://devblogs.nvidia.com/cuda-pro-tip-optimize-pointer-aliasing/ template @@ -97,6 +97,14 @@ ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data, if (b >= batch_size) return; + if (input_length == 0) { + if (threadIdx.x == 0) { + scalar_t log_likelihood = target_length == 0 ? 
0 : neginf; + neg_log_likelihood_data[b] = -log_likelihood; + } + return; + } + // first row (t=0), the three equations for alpha_1 above eq (6) for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { int64_t s = threadIdx.x + block_s; @@ -237,6 +245,9 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const if (targets.dim() == 1) { // concatenated targets int64_t pos = 0; for (int64_t i = 0; i < batch_size; i++) { + TORCH_CHECK(target_lengths[i] >= 0, + "Expected target_lengths to have value at least ", 0, ", but got value ", target_lengths[i], + " (while checking arguments for ", c, ")"); tg_batch_offsets_data[i] = pos; pos += target_lengths[i]; if (max_target_length < target_lengths[i]) @@ -249,6 +260,9 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const // dim is 2 int64_t tg_batch_stride = targets.stride(0); for (int64_t i = 0; i < batch_size; i++) { + TORCH_CHECK(target_lengths[i] >= 0, + "Expected target_lengths to have value at least ", 0, ", but got value ", target_lengths[i], + " (while checking arguments for ", c, ")"); tg_batch_offsets_data[i] = i * tg_batch_stride; if (max_target_length < target_lengths[i]) max_target_length = target_lengths[i]; @@ -261,6 +275,9 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const } int64_t max_input_length = log_probs.size(0); for (int64_t b = 0; b < batch_size; b++) { + TORCH_CHECK(input_lengths[b] >= 0, + "Expected input_lengths to have value at least ", 0, ", but got value ", input_lengths[b], + " (while checking arguments for ", c, ")"); TORCH_CHECK(input_lengths[b] <= max_input_length, "Expected input_lengths to have value at most ", max_input_length, ", but got value ", input_lengths[b], " (while checking arguments for ", c, ")"); @@ -273,8 +290,8 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); - // Very likely, we could be more clever here, e.g. learning (or genralizing and reusing) from SoftMax.cu... - constexpr int max_threads = std::is_same::value ? 1024 : 896; // we need 72 or so 32 bit registers for double + // Very likely, we could be more clever here, e.g. learning (or generalizing and reusing) from SoftMax.cu... + constexpr int max_threads = std::is_same::value ? 
1024 : 768; // we need 72 or so 32 bit registers for double int threads_target = max_threads; while (threads_target / 2 >= 2*max_target_length+1) { threads_target /= 2; @@ -322,7 +339,10 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data, if (b >= batch_size) return; - // "first" row, the beta initiaization before eq (10) (t=target_length - differes per batch) + if (input_length == 0) + return; + + // "first" row, the beta initialization before eq (10) (t=target_length - differes per batch) for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { int64_t s = threadIdx.x + block_s; scalar_t lb; diff --git a/aten/src/ATen/native/cuda/MaxUnpooling.cu b/aten/src/ATen/native/cuda/MaxUnpooling.cu index 340162c649ece..b364d679fa3b1 100644 --- a/aten/src/ATen/native/cuda/MaxUnpooling.cu +++ b/aten/src/ATen/native/cuda/MaxUnpooling.cu @@ -51,8 +51,8 @@ __global__ void max_unpooling2d_forward_kernel( template __global__ void max_unpooling3d_forward_kernel( - PackedTensorAccessor64 input, - PackedTensorAccessor64 indices, + PackedTensorAccessor64 input, + PackedTensorAccessor64 indices, T* output, const int64_t oT, const int64_t oH, @@ -64,8 +64,8 @@ __global__ void max_unpooling3d_forward_kernel( int64_t slice = (blockIdx.z + offsetZ) / input.size(1); // input slice/feature int64_t outputImageSize = oT * oH * oW; if (iRow < input.size(2) && iColumn < input.size(3)) { - T val = input[slice][iFrame][iRow][iColumn]; - int64_t index = indices[slice][iFrame][iRow][iColumn]; + const T val = input[slice][iFrame][iRow][iColumn]; + const int64_t index = indices[slice][iFrame][iRow][iColumn]; CUDA_KERNEL_ASSERT(index >= 0 && index < outputImageSize); output[slice * oT * oH * oW + index] = val; } @@ -370,8 +370,8 @@ Tensor& max_unpooling3d_forward_out_cuda(const Tensor& self_, block, 0, at::cuda::getCurrentCUDAStream()>>>( - self.packed_accessor64(), - indices.packed_accessor64(), + self.packed_accessor64(), + indices.packed_accessor64(), output.mutable_data_ptr(), oT, oH, diff --git a/aten/src/ATen/native/cuda/MemoryAccess.cuh b/aten/src/ATen/native/cuda/MemoryAccess.cuh index 8f47a039a1e3f..0fdc813fd7770 100644 --- a/aten/src/ATen/native/cuda/MemoryAccess.cuh +++ b/aten/src/ATen/native/cuda/MemoryAccess.cuh @@ -109,7 +109,7 @@ struct LoadWithCast { size_array_t element_sizes; LoadWithCast(const TensorIteratorBase& iter) { - assert(iter.ninputs() == N); + CUDA_KERNEL_ASSERT(iter.ninputs() == N); #pragma unroll for (auto i = 0; i < N; ++i) { this->dtypes[i] = iter.dtype(i + iter.noutputs()); @@ -140,7 +140,7 @@ struct StoreWithCast { size_array_t element_sizes; StoreWithCast(const TensorIteratorBase& iter) { - assert(iter.noutputs() == N); + CUDA_KERNEL_ASSERT(iter.noutputs() == N); #pragma unroll for (auto i = 0; i < N; ++i) { this->dtypes[i] = iter.dtype(i); @@ -197,7 +197,7 @@ struct unroll { data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {} __device__ inline bool check_inbounds(int thread_work_elem) { - return ((threadIdx.x + thread_work_elem*num_threads()) < remaining); + return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining); } template @@ -219,7 +219,6 @@ struct unroll { template __device__ inline void store(scalar_t *from, int idx) { int thread_idx = threadIdx.x; - scalar_t *to = reinterpret_cast(data[0]) + block_work_size() * idx; #pragma unroll for (int i = 0; i < thread_work_size(); i++) { if (thread_idx >= remaining) { @@ 
-305,7 +304,7 @@ struct multi_outputs_unroll { data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {} __device__ inline bool check_inbounds(int thread_work_elem) { - return ((threadIdx.x + thread_work_elem*num_threads()) < remaining); + return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining); } template @@ -347,7 +346,7 @@ struct multi_outputs_unroll { // which is C10_HOST_DEVICE, so we have to make this C10_HOST_DEVICE // in order to compile template -inline C10_HOST_DEVICE int can_vectorize_up_to(char *pointer) { +inline C10_HOST_DEVICE int can_vectorize_up_to(const char *pointer) { uint64_t address = reinterpret_cast(pointer); constexpr int vec2_alignment = std::alignment_of>::value; constexpr int vec4_alignment = std::alignment_of>::value; @@ -359,6 +358,11 @@ inline C10_HOST_DEVICE int can_vectorize_up_to(char *pointer) { return 1; } +template +inline C10_HOST_DEVICE int can_vectorize_up_to(char *pointer) { + return can_vectorize_up_to(static_cast(pointer)); +} + template struct can_vectorize_up_to_helper { template diff --git a/aten/src/ATen/native/cuda/NLLLoss2d.cu b/aten/src/ATen/native/cuda/NLLLoss2d.cu index 53d4238806b6e..94c9aeba79f51 100644 --- a/aten/src/ATen/native/cuda/NLLLoss2d.cu +++ b/aten/src/ATen/native/cuda/NLLLoss2d.cu @@ -56,6 +56,7 @@ __global__ void nll_loss2d_forward_no_reduce_kernel( int64_t ignore_index ) { int64_t batch_size = input.size(0); + int64_t n_classes = input.size(1); int64_t H = input.size(2); int64_t W = input.size(3); @@ -69,6 +70,7 @@ __global__ void nll_loss2d_forward_no_reduce_kernel( output[b][h][w] = static_cast(0); continue; } + CUDA_KERNEL_ASSERT(cur_target >= 0 && cur_target < n_classes); scalar_t value = input[b][cur_target][h][w]; scalar_t cur_weight = weight != nullptr ? 
weight[cur_target] : static_cast(1); output[b][h][w] = -value * cur_weight; diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu index c6c115dc640d8..247b1728badea 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu @@ -219,7 +219,7 @@ void slow_conv_transpose2d_out_cuda_template( // For each elt in batch, do: for (int elt = 0; elt < batch_size; elt++) { - // Matrix mulitply per output: + // Matrix multiply per output: input_n = input_.select(0, elt); output_n = output.select(0, elt); @@ -419,7 +419,7 @@ static void slow_conv_transpose2d_backward_out_cuda_template( // For each elt in batch, do: for (int elt = 0; elt < batch_size; elt++) { - // Matrix mulitply per sample: + // Matrix multiply per sample: grad_input_n = grad_input.select(0, elt); grad_output_n = grad_output.select(0, elt); @@ -611,12 +611,12 @@ void slow_conv_transpose2d_acc_grad_parameters_cuda_template( // For each elt in batch, do: for (int elt = 0; elt < batch_size; elt++) { - // Matrix mulitply per output: + // Matrix multiply per output: grad_output_n = grad_output.select(0, elt); // Do Weight: if (grad_weight.defined()) { - // Matrix mulitply per output: + // Matrix multiply per output: input_n = input.select(0, elt); if (need_columns) { diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu index 1074769392b48..fd6e83aa24171 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu @@ -301,7 +301,7 @@ void slow_conv_transpose3d_out_cuda_template( // For each elt in batch, do: for (int elt = 0; elt < batch_size; elt++) { - // Matrix mulitply per output: + // Matrix multiply per output: input_n = input.select(0, elt); output_n = output.select(0, elt); @@ -531,7 +531,7 @@ void slow_conv_transpose3d_backward_out_cuda_template( // For each elt in batch, do: for (int elt = 0; elt < batch_size; elt++) { - // Matrix mulitply per sample: + // Matrix multiply per sample: grad_input_n = grad_input.select(0, elt); grad_output_n = grad_output.select(0, elt); @@ -756,12 +756,12 @@ void slow_conv_transpose3d_acc_grad_parameters_cuda( // For each elt in batch, do: for (int elt = 0; elt < batch_size; elt++) { - // Matrix mulitply per output: + // Matrix multiply per output: grad_output_n = grad_output.select(0, elt); // Do Weight: if (grad_weight.defined()) { - // Matrix mulitply per output: + // Matrix multiply per output: input_n = input.select(0, elt); if (need_columns) { diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu index f2104ee9d0459..ce0a50daae145 100644 --- a/aten/src/ATen/native/cuda/Normalization.cu +++ b/aten/src/ATen/native/cuda/Normalization.cu @@ -1,5 +1,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include +#include #include #include #include @@ -12,6 +14,8 @@ #include #include #else +#include +#include #include #include #include @@ -19,8 +23,12 @@ #include #include #include +#include +#include #include #include +#include +#include #include #include #include @@ -124,7 +132,7 @@ void batch_norm_elementwise( out, self, *weight, *bias, mean_, invstd_); return; } - C10_FALLTHROUGH; + [[fallthrough]]; } case Impl::General: { const int64_t ndim = self.dim(); @@ -193,7 +201,7 @@ Tensor batch_norm_elementwise_backward_train( return 
batch_norm_backward_elemt_channels_last_cuda_template( grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu); } - C10_FALLTHROUGH; + [[fallthrough]]; } case Impl::General: { const auto ndim = input.dim(); @@ -259,9 +267,9 @@ Tensor batch_norm_elementwise_backward_eval( auto weight_nd = weight.as_strided(shape, strides); auto iter = TensorIteratorConfig() .add_output(grad_input) - .add_input(grad_out) - .add_input(invstd_nd) - .add_input(weight_nd) + .add_const_input(grad_out) + .add_const_input(invstd_nd) + .add_const_input(weight_nd) .check_all_same_dtype(false) .promote_inputs_to_common_dtype(false) .build(); @@ -277,8 +285,8 @@ Tensor batch_norm_elementwise_backward_eval( } else { auto iter = TensorIteratorConfig() .add_output(grad_input) - .add_input(grad_out) - .add_input(invstd_nd) + .add_const_input(grad_out) + .add_const_input(invstd_nd) .check_all_same_dtype(false) .promote_inputs_to_common_dtype(false) .build(); @@ -317,7 +325,7 @@ void batch_norm_mean_var(const Tensor& self, Tensor& save_mean, Tensor& save_var }); return; } - C10_FALLTHROUGH; + [[fallthrough]]; } case Impl::General: { const int64_t ndim = self.dim(); @@ -378,7 +386,7 @@ void batch_norm_update_stats_and_invert( .add_output(running_mean) .add_output(running_var) .add_output(save_var) - .add_input(save_mean) + .add_const_input(save_mean) .add_input(save_var) .add_input(running_mean) .add_input(running_var) @@ -473,6 +481,54 @@ std::tuple batch_norm_cuda(const Tensor& self, const c10 return std::make_tuple(output, save_mean, save_invstd); } +std::tuple _batch_norm_with_update_cuda( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + Tensor& running_mean, Tensor& running_var, double momentum, double eps) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); + Tensor output, save_mean, save_var, reserve; + + BatchNormBackend backend = _select_batch_norm_backend(input, weight, bias, running_mean, running_var, /*training*/true, eps); + if (backend == BatchNormBackend::Cudnn) { + std::tie(output, save_mean, save_var, reserve) = + at::cudnn_batch_norm(input, weight, bias, running_mean, running_var, /*training*/true, momentum, eps); + } else if (backend == BatchNormBackend::Miopen) { + reserve = at::empty({0}, input.options().dtype(kByte)); + std::tie(output, save_mean, save_var) = + at::miopen_batch_norm(input, weight, bias, running_mean, running_var, /*training*/true, momentum, eps); + } else { + reserve = at::empty({0}, input.options().dtype(kByte)); + std::tie(output, save_mean, save_var) = + batch_norm_cuda(input, weight_opt, bias_opt, running_mean, running_var, /*training*/true, momentum, eps); + } + return std::tuple(output, save_mean, save_var, reserve); +} + +std::tuple _batch_norm_with_update_cuda_out( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + Tensor& running_mean, Tensor& running_var, double momentum, double eps, + Tensor& out, Tensor& save_mean, Tensor& save_var, Tensor& reserve) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); + + BatchNormBackend backend = _select_batch_norm_backend(input, weight, 
bias, running_mean, running_var, /*training*/true, eps); + if (backend == BatchNormBackend::Cudnn) { + std::tie(out, save_mean, save_var, reserve) = + at::cudnn_batch_norm_out(out, save_mean, save_var, reserve, input, weight, bias, running_mean, running_var, /*training*/true, momentum, eps); + } else if (backend == BatchNormBackend::Miopen) { + std::tie(out, save_mean, save_var) = + at::miopen_batch_norm_out(out, save_mean, save_var, input, weight, bias, running_mean, running_var, /*training*/true, momentum, eps); + } else { + std::tie(out, save_mean, save_var) = + batch_norm_cuda_out(input, weight_opt, bias_opt, running_mean, running_var, /*update*/true, momentum, eps, out, save_mean, save_var); + } + return std::tuple(out, save_mean, save_var, reserve); +} + std::tuple _batch_norm_legit_cuda(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double epsilon) { return batch_norm_cuda(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, epsilon); } @@ -489,6 +545,28 @@ std::tuple _batch_norm_legit_no_stats_cuda_out(const return batch_norm_cuda_out(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, epsilon, output, save_mean, save_invstd); } +std::tuple _new_batch_norm_backward_cuda( + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { + const Tensor& dummy_bias = at::empty(1); + const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); + const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + const Tensor& save_mean = c10::value_or_else(save_mean_opt, [] {return Tensor();}); + const Tensor& save_var = c10::value_or_else(save_var_opt, [] {return Tensor();}); + + BatchNormBackend backend = _select_batch_norm_backend(input, weight, dummy_bias, running_mean, running_var, /*training*/true, eps); + + if (backend == BatchNormBackend::Cudnn) { + return at::cudnn_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var, eps, reserve); + } else if (backend == BatchNormBackend::Miopen) { + return at::miopen_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var, eps); + } else { + return batch_norm_backward_cuda(grad_output, input, weight, running_mean, running_var, save_mean, save_var, update, eps, grad_input_mask); + } +} + std::tuple batch_norm_backward_cuda(const Tensor& grad_out, const Tensor& input, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, double epsilon, std::array grad_input_mask) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight = at::borrow_from_optional_tensor(weight_opt); @@ -499,7 +577,7 @@ std::tuple batch_norm_backward_cuda(const Tensor& grad_o const bool needs_reduction = train || grad_input_mask[1] || grad_input_mask[2]; - // Fused reducion & elementwise kernel + // Fused reduction & elementwise kernel if (needs_reduction && grad_input_mask[0] && !batch_norm_use_channels_last_kernels(input) && cuda::detail::canUse32BitIndexMath(input) && @@ -722,6 +800,8 @@ std::tuple 
batch_norm_update_stats_cuda( c10::MaybeOwned running_var = at::borrow_from_optional_tensor(running_var_opt); const int64_t n_input = self.size(1); + + TORCH_CHECK(self.numel() != 0, "input tensor must have at least one element, but got input_sizes = ", self.sizes()); auto options = self.options().dtype( at::toAccumulateType(self.scalar_type(), /*is_cuda=*/true)); auto save_mean = at::empty({n_input}, options); diff --git a/aten/src/ATen/native/cuda/Normalization.cuh b/aten/src/ATen/native/cuda/Normalization.cuh index ab2b316bc8a4b..2cd05518d726e 100644 --- a/aten/src/ATen/native/cuda/Normalization.cuh +++ b/aten/src/ATen/native/cuda/Normalization.cuh @@ -210,12 +210,12 @@ __device__ __forceinline__ void welford_merge_block_vertical(C& count, template __global__ void batch_norm_transform_input_kernel( - const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor input, GenericPackedTensorAccessor output, const GenericPackedTensorAccessor::type, 1, RestrictPtrTraits, index_t> mean_, const GenericPackedTensorAccessor::type, 1, RestrictPtrTraits, index_t> var_or_invstd, - const GenericPackedTensorAccessor weight, - const GenericPackedTensorAccessor bias, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor bias, stat_accscalar_t epsilon) { index_t plane = blockIdx.x; @@ -267,7 +267,7 @@ struct Var { template __global__ void batch_norm_collect_statistics_kernel( - const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor input, const stat_accscalar_t epsilon, const stat_accscalar_t momentum, GenericPackedTensorAccessor save_mean, @@ -354,16 +354,16 @@ __global__ void batch_norm_collect_statistics_kernel( template __global__ void batch_norm_backward_kernel( - const GenericPackedTensorAccessor input, - const GenericPackedTensorAccessor grad_output, + const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor grad_output, GenericPackedTensorAccessor grad_input, GenericPackedTensorAccessor grad_weight, GenericPackedTensorAccessor grad_bias, - const GenericPackedTensorAccessor weight, - const GenericPackedTensorAccessor running_mean, - const GenericPackedTensorAccessor running_var, - const GenericPackedTensorAccessor save_mean, - const GenericPackedTensorAccessor save_invstd, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor running_mean, + const GenericPackedTensorAccessor running_var, + const GenericPackedTensorAccessor save_mean, + const GenericPackedTensorAccessor save_invstd, bool train, stat_accscalar_t epsilon) { @@ -385,7 +385,7 @@ __global__ void batch_norm_backward_kernel( // Compute two values across (batch, x/y/z) in one pass: // 1. Sum(grad_output) // 2. 
DotProduct(input - mean, grad_output) - GradOp> g(mean, input, grad_output); + GradOp> g(mean, input, grad_output); auto res = reduce>(g, grad_output, plane); stat_accscalar_t grad_output_sum = res.v1; @@ -582,7 +582,7 @@ __global__ void batch_norm_backward_elemt_kernel( template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> static GenericPackedTensorAccessor get_packed_accessor( const Tensor& t, c10::string_view var_name) { - constexpr auto expect_type = c10::CppTypeToScalarType::value; + constexpr auto expect_type = c10::CppTypeToScalarType::type>::value; const auto actual_type = t.scalar_type(); TORCH_CHECK(actual_type == expect_type, "Expected ", var_name, " to have type ", expect_type, " but got ", actual_type); @@ -624,25 +624,25 @@ std::tuple batch_norm_backward_cuda_template(const Tenso } auto input = get_packed_accessor< - input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input"); + const input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input"); auto grad_output = get_packed_accessor< - input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output"); + const input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output"); auto grad_input = packed_accessor_or_dummy< input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_input_reshaped, "grad_input"); auto weight = packed_accessor_or_dummy< - stat_scalar_t, 1, DefaultPtrTraits, index_t>(weight_, "weight"); + const stat_scalar_t, 1, DefaultPtrTraits, index_t>(weight_, "weight"); auto grad_weight = packed_accessor_or_dummy< stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_weight_, "grad_weight"); auto grad_bias = packed_accessor_or_dummy< stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_bias_, "grad_bias"); auto running_mean = packed_accessor_or_dummy< - stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_mean_, "running_mean"); + const stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_mean_, "running_mean"); auto running_var = packed_accessor_or_dummy< - stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_var_, "running_var"); + const stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_var_, "running_var"); auto save_mean = packed_accessor_or_dummy< - accscalar_t, 1, DefaultPtrTraits, index_t>(save_mean_, "save_mean"); + const accscalar_t, 1, DefaultPtrTraits, index_t>(save_mean_, "save_mean"); auto save_invstd = packed_accessor_or_dummy< - accscalar_t, 1, DefaultPtrTraits, index_t>(save_invstd_, "save_invstd"); + const accscalar_t, 1, DefaultPtrTraits, index_t>(save_invstd_, "save_invstd"); auto stream = at::cuda::getCurrentCUDAStream(); dim3 blocks(input.size(1)); @@ -670,7 +670,7 @@ void batch_norm_stats_cuda_template( resize_output(out_mean, {n_input}); resize_output(out_invstd, {n_input}); auto input = get_packed_accessor< - scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input"); + const scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input"); TORCH_INTERNAL_ASSERT(out_invstd.dim() == 1 && out_invstd.is_contiguous() && out_invstd.sizes()[0]); TORCH_INTERNAL_ASSERT(out_mean.dim() == 1 && out_mean.is_contiguous() && @@ -700,13 +700,13 @@ void batch_norm_elemt_cuda_template(const Tensor& output_, const Tensor& input_, auto output_reshaped = output_.view({input_.size(0), input_.size(1), -1}); auto input = get_packed_accessor< - input_scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input"); + const input_scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input"); auto output = get_packed_accessor< 
input_scalar_t, 3, RestrictPtrTraits, index_t>(output_reshaped, "output"); auto weight = packed_accessor_or_dummy< - stat_scalar_t, 1, RestrictPtrTraits, index_t>(weight_, "weight"); + const stat_scalar_t, 1, RestrictPtrTraits, index_t>(weight_, "weight"); auto bias = packed_accessor_or_dummy< - stat_scalar_t, 1, RestrictPtrTraits, index_t>(bias_, "bias"); + const stat_scalar_t, 1, RestrictPtrTraits, index_t>(bias_, "bias"); auto mean = packed_accessor_or_dummy< stat_accscalar_t, 1, RestrictPtrTraits, index_t>(mean_, "mean"); auto invstd = packed_accessor_or_dummy< diff --git a/aten/src/ATen/native/cuda/PersistentSoftmax.cuh b/aten/src/ATen/native/cuda/PersistentSoftmax.cuh index ac5cf934fab04..4553276bab684 100644 --- a/aten/src/ATen/native/cuda/PersistentSoftmax.cuh +++ b/aten/src/ATen/native/cuda/PersistentSoftmax.cuh @@ -316,7 +316,7 @@ void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_ele // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; - // use 128 threads per block to maximimize gpu utilization + // use 128 threads per block to maximize gpu utilization constexpr int threads_per_block = 128; int warps_per_block = (threads_per_block / warp_size); @@ -366,7 +366,7 @@ void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; - // use 128 threads per block to maximimize gpu utilization + // use 128 threads per block to maximize gpu utilization constexpr int threads_per_block = 128; int warps_per_block = (threads_per_block / warp_size); diff --git a/aten/src/ATen/native/cuda/RNN.cu b/aten/src/ATen/native/cuda/RNN.cu index cf8887d9cc132..a997777fe0c3a 100644 --- a/aten/src/ATen/native/cuda/RNN.cu +++ b/aten/src/ATen/native/cuda/RNN.cu @@ -55,7 +55,7 @@ bool allContiguous(at::TensorList tensors) { } void getLaunchConfig(dim3* block, dim3* grid, int64_t numel) { - int curDevice = -1; + c10::DeviceIndex curDevice = -1; c10::cuda::GetDevice(&curDevice); *block = cuda::getApplyBlock(); TORCH_INTERNAL_ASSERT(cuda::getApplyGrid(numel, *grid, curDevice), diff --git a/aten/src/ATen/native/cuda/ROCmLoops.cuh b/aten/src/ATen/native/cuda/ROCmLoops.cuh deleted file mode 100644 index 75811d7ae6102..0000000000000 --- a/aten/src/ATen/native/cuda/ROCmLoops.cuh +++ /dev/null @@ -1,364 +0,0 @@ -#pragma once - -// This file provides two functions to help write GPU elementwise kernels: -// -// gpu_kernel(TensorIterator iter, ) -// gpu_kernel_with_scalars(TensorIterator iter, ) -// -// The gpu_kernel_with_scalars generates specializations that support a -// single scalar CPU argument, such as from `cuda_tensor + 5`. The CPU scalar -// is lifted to a kernel parameter instead of copying to device memory. -// This should be used in conjunction with TensorIterator::allow_cpu_scalars_, -// which is the default for TensorIterator::binary_op. Otherwise, all inputs -// and the output must be on the GPU. 
-// -// For example, to write a reciprocal kernel for GPU float Tensors: -// -// gpu_kernel(iter, []GPU_LAMBDA(float a) { -// return 1.0f / a; -// }); -// -// To write a multiplication kernel for GPU float Tensors where one argument -// may be a CPU scalar: -// -// gpu_kernel_with_scalars(iter, []GPU_LAMBDA(float a, float b) { -// return a * b; -// }); -// -// See BinaryOpsKernel.cu for the complete implementation -// - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - - -#ifdef __NVCC__ -#define ASSERT_HOST_DEVICE_LAMBDA(type) \ - static_assert(__nv_is_extended_host_device_lambda_closure_type(type), \ - #type " must be a __host__ __device__ lambda") -#else -#define ASSERT_HOST_DEVICE_LAMBDA(type) -#endif - -static constexpr int launch_size_1d = 512; -static constexpr int launch_size_nd = 128; -static constexpr int launch_bound2 = 4; - - -namespace at { namespace native { - -// See [NOTE: Complex Operator Unification] -// std::complex and thrust::complex don't work with some !needs_dynamic_casting optimizations. -// They always currently map to !needs_dynamic_casting even though we sometimes rely on the ability -// to reinterpret_cast between these representations. -// In order to separate these concerns, we have a check for non-c10 complex separately. -template::arity> -struct uses_non_c10_complex { - constexpr static bool check() { - using traits = function_traits; - using type = typename traits::template arg::type; - constexpr bool non_c10_complex = - std::is_same, type>::value - || std::is_same, type>::value - || std::is_same, type>::value - || std::is_same, type>::value; - - if constexpr (non_c10_complex) { - return true; - } else { - return uses_non_c10_complex::check(); - } - } -}; - -template -struct uses_non_c10_complex { - constexpr static bool check() { - using traits = function_traits; - using type = typename traits::result_type; - constexpr bool non_c10_complex = - std::is_same, type>::value - || std::is_same, type>::value - || std::is_same, type>::value - || std::is_same, type>::value; - - return non_c10_complex; - } -}; - -// NOTE: @zasdfgbnm is currently working on rewriting the gpu loops. -// Some of the old codes has been moved to namespace legacy, and -// new codes will be put into namespace modern. These two namespaces -// will coexists for a while until the rewrite is done. Once the rewrite -// is done, we will remove the legacy and modern namespace and everything -// will be in at::native directly. 
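
The uses_non_c10_complex trait defined above is a purely compile-time inspection of the functor's argument and result types. A rough, self-contained sketch of that kind of check follows, with hypothetical helper names, a C++17 fold expression instead of the header's recursion, and only std::complex where the header also handles thrust::complex; it is not the at::native implementation.

#include <complex>
#include <type_traits>

// Is T a raw std::complex (as opposed to c10::complex)?
template <typename T>
struct is_std_complex : std::false_type {};
template <typename T>
struct is_std_complex<std::complex<T>> : std::true_type {};

// Hypothetical, minimal traits over a lambda's call operator: does the functor
// take or return std::complex anywhere?
template <typename T>
struct uses_std_complex : uses_std_complex<decltype(&T::operator())> {};

template <typename C, typename R, typename... Args>
struct uses_std_complex<R (C::*)(Args...) const> {
  static constexpr bool value =
      is_std_complex<std::decay_t<R>>::value ||
      (is_std_complex<std::decay_t<Args>>::value || ...);
};

int main() {
  auto real_op = [](float a, float b) { return a + b; };
  auto cplx_op = [](std::complex<float> a) { return a * a; };
  static_assert(!uses_std_complex<decltype(real_op)>::value, "no complex args");
  static_assert(uses_std_complex<decltype(cplx_op)>::value, "complex arg");
  return 0;
}

The real check feeds into the needs_dynamic_casting decision used further down in this file; the sketch only shows the type inspection.
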
-namespace legacy { - -template -C10_LAUNCH_BOUNDS_2(nt, launch_bound2) -__global__ void elementwise_kernel(int N, func_t f) { - int tid = threadIdx.x; - int nv = nt * vt; - int idx = nv * blockIdx.x + tid; - #pragma unroll - for (int i = 0; i < vt; i++) { - if (idx < N) { - f(idx); - idx += nt; - } - } -} - -template -static void launch_kernel(int64_t N, const func_t& f) { - TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits::max()); - if (N == 0) { - return; - } - dim3 block(nt); - dim3 grid((N + block.x * vt - 1) / (block.x * vt)); - auto stream = at::cuda::getCurrentCUDAStream(); - elementwise_kernel<<>>(N, f); - C10_CUDA_KERNEL_LAUNCH_CHECK(); -} - -template -C10_HOST_DEVICE typename traits::result_type -invoke_impl(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], int i, - std::index_sequence) { - return f(c10::load::type>(data[INDEX] + i * strides[INDEX])...); -} - -template > -C10_HOST_DEVICE typename traits::result_type -invoke(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], int i) { - using Indices = std::make_index_sequence; - return invoke_impl(f, data, strides, i, Indices{}); -} - -template -C10_HOST_DEVICE typename traits::result_type -invoke_impl(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i, - std::index_sequence) { - return f(c10::fetch_and_cast::type>(dtypes[I], data[I] + i * strides[I])...); -} - -template > -C10_HOST_DEVICE typename traits::result_type -invoke(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i) { - using Indices = std::make_index_sequence; - return invoke_impl(f, data, strides, dtypes, i, Indices{}); -} - -} // namespace legacy - -// See the note for namespace legacy above. -namespace modern { - -namespace detail { - -template -__device__ inline constexpr decltype(auto) invoke_with_array_impl(func_t f, array_t t, std::index_sequence) -{ - return f(t[I]...); -} -template -__device__ inline constexpr decltype(auto) invoke_with_array(func_t f, array_t a) { - constexpr auto arity = function_traits::arity; - return invoke_with_array_impl(f, a, std::make_index_sequence{}); -} - -namespace arg_type { - -// We need a way to compute the argument type of a function. But -// for nullary function, it does not really have an argument type -// in this case, we still need to return a valid type, but we don't -// really care what type this is. - -struct dont_care {}; - -template -struct arg_type_helper { - using type = typename function_traits::template arg<0>::type; -}; - -template -struct arg_type_helper { - using type = dont_care; -}; - -template -using type = typename arg_type_helper::arity>::type; - -} // namespace arg_type - -template::arity-1> -struct has_same_arg_types { - using traits = function_traits; - static constexpr bool value = std::is_same< - typename traits::template arg::type, - typename traits::template arg::type - >::value && has_same_arg_types::value; -}; - -template -struct has_same_arg_types { - static constexpr bool value = true; -}; - -template -struct has_same_arg_types { - static constexpr bool value = true; -}; - -} // namespace detail - -template -C10_LAUNCH_BOUNDS_1(num_threads()) -__global__ void elementwise_kernel(int N, func_t f, array_t data) { - // Assumption: - // 1. all arguments of `f` have the same type, which could be different from the return type of `f` - // 2. 
all tensors are contiguous, that is: stride == sizeof(type) for all tensors - - using traits = function_traits; - using return_t = typename traits::result_type; - using arg_t = detail::arg_type::type; - constexpr int arity = traits::arity; - - // We need to create array to hold all the arguments, for nullary `f`, this means array of size 0. - // Unfortunately the compiler don't allow us to create array of 0 size, so for this case, we create - // an array of size 1 and just don't use it. - constexpr int nargs = traits::arity == 0 ? 1 : traits::arity; - - int tid = threadIdx.x; - int idx = block_work_size() * blockIdx.x + tid; - - // compute base pointers - return_t *result_base = reinterpret_cast(data[0]) + idx; - arg_t *args_base[nargs]; - #pragma unroll - for (int i = 0; i < arity; i++) { - args_base[i] = reinterpret_cast(data[i + 1]) + idx; - } - - // fetch data - return_t results[thread_work_size()]; - arg_t args[thread_work_size()][nargs]; - #pragma unroll - for (int i = 0; i < thread_work_size(); i++) { - if (idx + num_threads() * i < N) { - #pragma unroll - for (int j = 0; j < arity; j++) { - args[i][j] = c10::load(args_base[j] + i * num_threads()); - } - } - } - - // compute - #pragma unroll - for (int i = 0; i < thread_work_size(); i++) { - if (idx + num_threads() * i < N) { - results[i] = detail::invoke_with_array(f, args[i]); - } - } - - // store data - #pragma unroll - for (int i = 0; i < thread_work_size(); i++) { - if (idx + num_threads() * i < N) { - *(result_base + i * num_threads()) = results[i]; - } - } -} - -// TODO (@zasdfgbnm): this function assume trivial 1d and no dynamic casting -template::value, int> = 0> -static void launch_kernel(int64_t N, const func_t& f, array_t data) { - TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits::max()); - if (N == 0) { - return; - } - int64_t grid = (N + block_work_size() - 1) / block_work_size(); - auto stream = at::cuda::getCurrentCUDAStream(); - elementwise_kernel<<>>(N, f, data); - C10_CUDA_KERNEL_LAUNCH_CHECK(); -} - -template::value, int> = 0> -static void launch_kernel(int64_t N, const func_t& f, array_t data) {} - -} // namespace modern - - -template -void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { - using traits = function_traits; - using arg0_t = typename traits::result_type; - constexpr int ntensors = traits::arity + 1; - - TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); - TORCH_INTERNAL_ASSERT(iter.ntensors() == traits::arity + 1); - bool non_c10_complex = uses_non_c10_complex::check(); - - at::detail::Array data; - for (int i = 0; i < ntensors; i++) { - data[i] = (char*)iter.data_ptr(i); - } - - at::detail::Array dtypes; - for (int i = 0; i < ntensors; i++) { - dtypes[i] = iter.dtype(i); - } - - int64_t numel = iter.numel(); - if (iter.is_trivial_1d()) { - auto inner_strides = iter.get_inner_strides(); - at::detail::Array strides; - for (int i = 0; i < ntensors; i++) { - strides[i] = inner_strides[i]; - } - - // TODO: can non_c10_complex go through the other path? Need to verify. 
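
In the dynamic-casting branch that follows, every operand is read through a runtime-dtype cast and the result is written back with cast_and_store rather than reinterpreting pointers at a fixed type. A minimal, self-contained sketch of that load-cast-compute-cast-store pattern is shown below; a hypothetical two-value dtype enum stands in for ScalarType, and the _demo helpers are illustrations, not the c10 implementation.

#include <cstdio>

// Hypothetical stand-in for ScalarType, restricted to two dtypes.
enum class Dtype { Float, Double };

// Fetch a value of runtime dtype `t` from `ptr` and cast it to compute_t.
template <typename compute_t>
__host__ __device__ compute_t fetch_and_cast_demo(Dtype t, const void* ptr) {
  switch (t) {
    case Dtype::Float:  return static_cast<compute_t>(*static_cast<const float*>(ptr));
    case Dtype::Double: return static_cast<compute_t>(*static_cast<const double*>(ptr));
  }
  return compute_t(0);
}

// Cast a compute_t value to the runtime dtype `t` and store it at `ptr`.
template <typename compute_t>
__host__ __device__ void cast_and_store_demo(Dtype t, void* ptr, compute_t v) {
  switch (t) {
    case Dtype::Float:  *static_cast<float*>(ptr)  = static_cast<float>(v);  break;
    case Dtype::Double: *static_cast<double*>(ptr) = static_cast<double>(v); break;
  }
}

int main() {
  double in = 2.5;   // input stored as double
  float out = 0.f;   // output stored as float
  // Compute in float, like a float-typed lambda applied to dynamically cast inputs.
  const float result = fetch_and_cast_demo<float>(Dtype::Double, &in) * 2.0f;
  cast_and_store_demo<float>(Dtype::Float, &out, result);
  printf("%f\n", out);  // prints 5.000000
  return 0;
}

Skipping this cast path when needs_dynamic_casting is false is what allows the contiguous path to reinterpret the data pointers directly.
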
- if (needs_dynamic_casting::check(iter) || non_c10_complex) { - legacy::launch_kernel(numel, [=]GPU_LAMBDA(int idx) { - void* out = data[0] + strides[0] * idx; - arg0_t result = legacy::invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx); - c10::cast_and_store(dtypes[0], out, result); - }); - } else if (iter.has_contiguous_first_dim() && modern::detail::has_same_arg_types::value) { - modern::launch_kernel(numel, f, data); - } else { - legacy::launch_kernel(numel, [=]GPU_LAMBDA(int idx) { - arg0_t* out = (arg0_t*)(data[0] + strides[0] * idx); - *out = legacy::invoke(f, &data.data[1], &strides.data[1], idx); - }); - } - } else { - auto offset_calc = ::make_offset_calculator(iter); - // TODO: can non_c10_complex go through the other path? Need to verify. - if (needs_dynamic_casting::check(iter) || non_c10_complex) { - legacy::launch_kernel(numel, [=]GPU_LAMBDA(int idx) { - auto offsets = offset_calc.get(idx); - void* out = data[0] + offsets[0]; - arg0_t result = legacy::invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1); - c10::cast_and_store(dtypes[0], out, result); - }); - } else { - legacy::launch_kernel(numel, [=]GPU_LAMBDA(int idx) { - auto offsets = offset_calc.get(idx); - arg0_t* out = (arg0_t*)(data[0] + offsets[0]); - *out = legacy::invoke(f, &data.data[1], &offsets.data[1], 1); - }); - } - } -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 0ccb01110e2a6..1f67ee3ea63e1 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -1054,7 +1054,7 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){ // Case 1: "vectorize along input" // This case happens when we are reducing along fastest moving dimesion. In such case, threads // with the same threadIdx.y works on the same reduction cooperatively and will produce results - // for the same ouput. In such case, values in each loaded vector always correspond to the same ouput. + // for the same output. In such case, values in each loaded vector always correspond to the same output. // // Case 2: "vectorize along output" // This case happens when the fastest moving dimesion is not the dimension of reduction. 
In such case, diff --git a/aten/src/ATen/native/cuda/ReflectionPad.cu b/aten/src/ATen/native/cuda/ReflectionPad.cu index 3e576c896742e..6f0ba1fbb7905 100644 --- a/aten/src/ATen/native/cuda/ReflectionPad.cu +++ b/aten/src/ATen/native/cuda/ReflectionPad.cu @@ -160,10 +160,10 @@ __global__ void reflection_pad2d_backward_out_kernel( gpuAtomicAddNoReturn(&grad_input[index_pair.first], grad_output[index_pair.second]); } } -template +template __device__ inline void parallel_reflection_pad3d( - PackedTensorAccessor64 input, - PackedTensorAccessor64 output, + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, int64_t pad_left, int64_t pad_top, int64_t pad_front, @@ -211,7 +211,7 @@ __device__ inline void parallel_reflection_pad3d( template __global__ void reflection_pad3d_out_kernel( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int64_t pad_left, int64_t pad_top, int64_t pad_front, int64_t y_shift, int64_t z_shift @@ -241,7 +241,7 @@ __global__ void reflection_pad3d_out_kernel( template __global__ void reflection_pad3d_backward_out_kernel( PackedTensorAccessor64 grad_input, - PackedTensorAccessor64 grad_output, + PackedTensorAccessor64 grad_output, int64_t pad_left, int64_t pad_top, int64_t pad_front, int64_t y_shift, int64_t z_shift ) { @@ -595,7 +595,7 @@ TORCH_IMPL_FUNC(reflection_pad3d_out_cuda) ( output_inner = output.unsqueeze(0); } - auto input_packed = input_inner.packed_accessor64(); + auto input_packed = input_inner.packed_accessor64(); auto output_packed = output_inner.packed_accessor64(); int64_t output_plane_size = output_packed.size(2) * output_packed.size(3) * output_packed.size(4); @@ -648,7 +648,7 @@ TORCH_IMPL_FUNC(reflection_pad3d_backward_out_cuda) ( } auto grad_input_packed = grad_input_.packed_accessor64(); - auto grad_output_packed = grad_output_.packed_accessor64(); + auto grad_output_packed = grad_output_.packed_accessor64(); int64_t output_plane_size = grad_output_packed.size(2) * grad_output_packed.size(3) * grad_output_packed.size(4); diff --git a/aten/src/ATen/native/cuda/Repeat.cu b/aten/src/ATen/native/cuda/Repeat.cu index 65c6863745c8c..0a39a0445dbe2 100644 --- a/aten/src/ATen/native/cuda/Repeat.cu +++ b/aten/src/ATen/native/cuda/Repeat.cu @@ -12,8 +12,8 @@ template __global__ static void compute_cuda_kernel( - index_t* repeat_ptr, - int64_t* cumsum_ptr, + const index_t* repeat_ptr, + const int64_t* cumsum_ptr, index_t* result_ptr, int64_t size, int64_t result_size) { @@ -35,8 +35,8 @@ __global__ static void compute_cuda_kernel( template static void compute_cuda( - index_t* repeat_ptr, - int64_t* cumsum_ptr, + const index_t* repeat_ptr, + const int64_t* cumsum_ptr, index_t* result_ptr, int64_t size, int64_t result_size) { diff --git a/aten/src/ATen/native/cuda/ReplicationPadding.cu b/aten/src/ATen/native/cuda/ReplicationPadding.cu index e65c0e90fe03d..d6517516e51ff 100644 --- a/aten/src/ATen/native/cuda/ReplicationPadding.cu +++ b/aten/src/ATen/native/cuda/ReplicationPadding.cu @@ -39,23 +39,23 @@ __host__ __device__ __forceinline__ int imax(int a, int b) { namespace { template __global__ void replication_pad_forward_kernel1d( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, const int padL, const int y_shift, const int z_shift) { - const int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; - const int plane = blockIdx.y + y_shift; - const int batch = blockIdx.z + z_shift; + const int64_t outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + const 
int64_t plane = blockIdx.y + y_shift; + const int64_t batch = blockIdx.z + z_shift; if (outputPointId >= output.size(2)) { return; } - const int outputPointX = outputPointId % output.size(2); + const auto outputPointX = outputPointId % output.size(2); const int iStartX = imax(0, -padL); const int oStartX = imax(0, padL); - const int inputPointX = imin(imax(padL, outputPointX), input.size(2) + padL - 1) - oStartX + iStartX; + const auto inputPointX = imin(imax(padL, outputPointX), input.size(2) + padL - 1) - oStartX + iStartX; scalar_t valueToCopy = input[batch][plane][inputPointX]; output[batch][plane][outputPointX] = valueToCopy; @@ -64,22 +64,22 @@ __global__ void replication_pad_forward_kernel1d( template __global__ void replication_pad_backward_kernel( PackedTensorAccessor64 gradInput, - PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 gradOutput, const int padL, const int y_shift, const int z_shift) { - const int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; - const int plane = blockIdx.y + y_shift; - const int batch = blockIdx.z + z_shift; + const int64_t outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + const int64_t plane = blockIdx.y + y_shift; + const int64_t batch = blockIdx.z + z_shift; if (outputPointId >= gradOutput.size(2)) { return; } - const int outputPointX = outputPointId % gradOutput.size(2); + const auto outputPointX = outputPointId % gradOutput.size(2); const int iStartX = imax(0, -padL); const int oStartX = imax(0, padL); - const int inputPointX = imin(imax(padL, outputPointX), gradInput.size(2) + padL - 1) - oStartX + iStartX; + const auto inputPointX = imin(imax(padL, outputPointX), gradInput.size(2) + padL - 1) - oStartX + iStartX; scalar_t valueToCopy = gradOutput[batch][plane][outputPointX]; gpuAtomicAddNoReturn(&gradInput[batch][plane][inputPointX], valueToCopy); @@ -87,7 +87,7 @@ __global__ void replication_pad_backward_kernel( template __global__ void replication_pad_forward_kernel2d( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, const int padT, const int padL, @@ -117,7 +117,7 @@ __global__ void replication_pad_forward_kernel2d( template __global__ void replication_pad_backward_kernel( PackedTensorAccessor64 gradInput, - PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 gradOutput, const int padT, const int padL, const int y_shift, @@ -145,7 +145,7 @@ __global__ void replication_pad_backward_kernel( template __global__ void replication_pad_forward_kernel3d( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, const int pfront, const int ptop, @@ -185,7 +185,7 @@ __global__ void replication_pad_forward_kernel3d( template __global__ void replication_pad_backward_kernel( PackedTensorAccessor64 gradInput, - PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 gradOutput, const int pfront, const int ptop, const int pleft, @@ -278,7 +278,7 @@ void replication_pad2d_backward_out_cuda_template( gradOutput_ = gradOutput.unsqueeze(0); } auto devGradInput = gradInput_.packed_accessor64(); - auto devGradOutput = gradOutput_.packed_accessor64(); + auto devGradOutput = gradOutput_.packed_accessor64(); int64_t outputPlaneSize = devGradOutput.size(2) * devGradOutput.size(3); int64_t size1 = devGradOutput.size(1); @@ -392,7 +392,7 @@ void replication_pad3d_backward_out_cuda_template( gradOutput_ = gradOutput.unsqueeze(0); } auto devGradInput = gradInput_.packed_accessor64(); - auto devGradOutput = gradOutput_.packed_accessor64(); + auto 
devGradOutput = gradOutput_.packed_accessor64(); const int64_t outputPlaneSize = devGradOutput.size(2) * devGradOutput.size(3) * devGradOutput.size(4); const int64_t size1 = devGradOutput.size(1); @@ -419,8 +419,8 @@ void replication_pad3d_backward_out_cuda_template( TORCH_IMPL_FUNC(replication_pad1d_out_cuda) ( const Tensor& input, IntArrayRef paddingSize, const Tensor& output ) { - TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), - "input tensor must fit into 32-bit index math"); + TORCH_CHECK(input.numel() < std::numeric_limits::max(), + "replication_pad1d only supports input tensors with less than 2^63 - 1 elements"); int64_t padL = paddingSize[0]; int64_t padR = paddingSize[1]; @@ -446,7 +446,7 @@ TORCH_IMPL_FUNC(replication_pad1d_out_cuda) ( output_ = output.unsqueeze(0); } - auto devInput = input_.packed_accessor64(); + auto devInput = input_.packed_accessor64(); auto devOutput = output_.packed_accessor64(); int64_t outputPlaneSize = devOutput.size(2); @@ -480,19 +480,19 @@ TORCH_IMPL_FUNC(replication_pad1d_backward_out_cuda) ( // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("replication_pad1d_backward_cuda"); - TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), - "input tensor must fit into 32-bit index math"); - TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), - "output gradient tensor must fit into 32-bit index math"); + TORCH_CHECK(input.numel() < std::numeric_limits::max(), + "replication_pad1d only supports input tensors with less than 2^63 - 1 elements"); + TORCH_CHECK(gradOutput.numel() < std::numeric_limits::max(), + "replication_pad1d only supports output tensors with less than 2^63 - 1 elements"); - const int padL = paddingSize[0]; - int dimw = 1; + const int64_t padL = paddingSize[0]; + int64_t dimw = 1; - int numInputDims = input.ndimension(); + int64_t numInputDims = input.ndimension(); if (numInputDims == 3) { dimw++; } - int iwidth = input.size(dimw); + int64_t iwidth = input.size(dimw); if (gradInput.numel() == 0) { return; @@ -509,7 +509,7 @@ TORCH_IMPL_FUNC(replication_pad1d_backward_out_cuda) ( gradOutput_ = gradOutput.unsqueeze(0); } auto devGradInput = gradInput_.packed_accessor64(); - auto devGradOutput = gradOutput_.packed_accessor64(); + auto devGradOutput = gradOutput_.packed_accessor64(); int64_t outputPlaneSize = devGradOutput.size(2); int64_t size1 = devGradOutput.size(1); @@ -551,7 +551,7 @@ TORCH_IMPL_FUNC(replication_pad2d_out_cuda) ( input_ = input.unsqueeze(0); output_ = output.unsqueeze(0); } - auto devInput = input_.packed_accessor64(); + auto devInput = input_.packed_accessor64(); auto devOutput = output_.packed_accessor64(); int64_t outputPlaneSize = devOutput.size(2) * devOutput.size(3); int64_t size1 = devOutput.size(1); @@ -644,7 +644,7 @@ TORCH_IMPL_FUNC(replication_pad3d_out_cuda) ( output_ = output.unsqueeze(0); } - auto devInput = input_.packed_accessor64(); + auto devInput = input_.packed_accessor64(); auto devOutput = output_.packed_accessor64(); const int64_t outputPlaneSize = devOutput.size(2) * devOutput.size(3) * devOutput.size(4); diff --git a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu index 5509d854a34dd..9ef83599cd15c 100644 --- a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu +++ b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu @@ -82,7 +82,7 @@ static TensorAssign tensor_assign; // of the same size. 
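
Looking back at the ReplicationPadding.cu hunks above: dropping the canUse32BitIndexMath requirement in favor of a plain numel bound only works because the per-thread position arithmetic is widened to int64_t at the same time; a flattened output position can exceed what a 32-bit int holds even when each launch dimension individually fits. The following is a minimal sketch of the overflow-safe pattern, illustrative only and not the padding kernels themselves.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Promote to 64-bit before multiplying so the flattened position cannot wrap
// in 32-bit arithmetic; the last thread reports its own position.
__global__ void last_flat_index(int64_t* out) {
  const int64_t flat = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int64_t total = static_cast<int64_t>(gridDim.x) * blockDim.x;
  if (flat == total - 1) {
    *out = flat;
  }
}

int main() {
  int64_t* d = nullptr;
  cudaMalloc(&d, sizeof(int64_t));
  // 8,000,000 blocks of 1024 threads is about 8.2e9 positions, past both
  // INT_MAX and UINT_MAX, so a 32-bit flattened index would wrap here.
  last_flat_index<<<8'000'000, 1024>>>(d);
  int64_t h = 0;
  cudaMemcpy(&h, d, sizeof(h), cudaMemcpyDeviceToHost);
  printf("last position = %lld\n", static_cast<long long>(h));  // 8191999999
  cudaFree(d);
  return 0;
}
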
template struct alignas(N) OpaqueType { char data[N]; }; -// essentialy rewritten related to legacy::launch_kernel parts +// essentially rewritten related to legacy::launch_kernel parts template C10_LAUNCH_BOUNDS_2(nt, vt) __global__ void _scatter_gather_elementwise_kernel(int N, func_t f) { @@ -188,8 +188,8 @@ struct cuda_scatter_gather_base_kernel { .check_all_same_dtype(false) .resize_outputs(false) .add_output(self_restrided) - .add_input(src_restrided) - .add_input(index) + .add_const_input(src_restrided) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(self, dim); @@ -246,8 +246,8 @@ struct cuda_scatter_gather_base_kernel { .check_all_same_dtype(false) .resize_outputs(false) .add_output(self_restrided) - .add_input(src_restrided) - .add_input(index) + .add_const_input(src_restrided) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(self, dim); @@ -305,8 +305,8 @@ struct cuda_scatter_gather_base_kernel { .check_all_same_dtype(false) .resize_outputs(false) .add_output(self_restrided) - .add_input(src_restrided) - .add_input(index) + .add_const_input(src_restrided) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(self, dim); @@ -401,7 +401,7 @@ struct cuda_scatter_fill_base_kernel { .check_all_same_dtype(false) .resize_outputs(false) .add_output(self_restrided) - .add_input(index) + .add_const_input(index) .build(); auto index_size = ensure_nonempty_size(self, dim); @@ -444,7 +444,7 @@ struct cuda_scatter_fill_base_kernel { .check_all_same_dtype(false) .resize_outputs(false) .add_output(self_restrided) - .add_input(index) + .add_const_input(index) .build(); auto index_size = ensure_nonempty_size(self, dim); diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu index 1680a43e014f4..99fea30540210 100644 --- a/aten/src/ATen/native/cuda/Shape.cu +++ b/aten/src/ATen/native/cuda/Shape.cu @@ -51,6 +51,28 @@ inline bool getCatGrid(ptrdiff_t nTensors, dim3& grid) { return true; } +template +inline std::tuple getCatGridRocm(unsigned int max_elements_per_tensor, + ptrdiff_t nTensors) { + constexpr unsigned int threads_per_block = 256; + constexpr unsigned int elements_per_thread = 8; + constexpr unsigned int max_tb_per_sm = 32; + + unsigned int max_threads = ceil_div(max_elements_per_tensor, elements_per_thread); + unsigned int thread_blocks = ceil_div(max_threads, threads_per_block); + + // Limit the number of thread blocks to prevent too many threads to load the metadata + // if they operate on very small tensors. 
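
(The getCatGridRocm body continues below with the SM-count cap.) For illustration only, the same grid arithmetic applied to assumed numbers: a worst-case tensor of 1<<20 elements on a hypothetical 108-SM device; only the constants 256, 8, and 32 come from the patch itself.

#include <algorithm>
#include <cstdio>

int main() {
  constexpr unsigned int threads_per_block = 256;
  constexpr unsigned int elements_per_thread = 8;
  constexpr unsigned int max_tb_per_sm = 32;
  const unsigned int max_elements_per_tensor = 1u << 20;  // assumed
  const unsigned int num_sm = 108;                        // assumed

  // ceil_div(a, b) == (a + b - 1) / b
  const unsigned int max_threads =
      (max_elements_per_tensor + elements_per_thread - 1) / elements_per_thread;  // 131072
  unsigned int thread_blocks =
      (max_threads + threads_per_block - 1) / threads_per_block;                  // 512
  thread_blocks = std::min(num_sm * max_tb_per_sm, thread_blocks);                // min(3456, 512) = 512

  printf("grid = (%u, nTensors), block = %u\n", thread_blocks, threads_per_block);
  return 0;
}

For much larger inputs the num_sm * max_tb_per_sm product is what bounds the grid.
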
+ + const unsigned int num_sm = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + thread_blocks = std::min(num_sm * max_tb_per_sm, thread_blocks); + + dim3 block = dim3(threads_per_block); + dim3 grid = dim3(thread_blocks, (long long)nTensors); + + return std::make_tuple(grid, block); +} + template inline std::tuple getCatGridContig(unsigned int max_elements_per_tensor, ptrdiff_t nTensors) { @@ -176,6 +198,34 @@ __global__ void CatArrayBatchedCopy( } } +template +__global__ void CatArrayBatchedCopy_contig( + T* output, + CatArrInputTensorMetadata inputs, + TensorSizeStride os, + const int concatDim, + IndexType dimStride) { + + IndexType tid = blockIdx.x * blockDim.x + threadIdx.x; + IndexType nElements = inputs.nElements[blockIdx.y]; + + if(tid >= nElements) return; + + const T* data = inputs.input[blockIdx.y]; + IndexType offset = inputs.offset[blockIdx.y]; + IndexType dimSize = inputs.dimSize[blockIdx.y]; + IndexType dataOffset = offset * dimStride; + + IndexType stride = gridDim.x * blockDim.x; + + while( tid < nElements){ + IndexType elementOffset = CatArrIndexToOffset::compute( + os.tensorSize, os.tensorStride, dimSize, concatDim, tid); + output[dataOffset + elementOffset] = data[tid]; + tid += stride; + } +} + /* Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads to improve memory bandwidth throughput. @@ -295,9 +345,14 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i catMetaData.dimSize[batchCounter] = dimSize; catMetaData.nElements[batchCounter] = inputs[i+batchCounter].get().numel(); +#ifdef USE_ROCM + // On ROCm, CatArrayBatchedCopy_contig is faster + isAligned = false; +#else // If at least one of the inputs is not aligned, we can't call the // CatArrayBatchedCopy_aligned16_contig isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]); +#endif if (stride_size > 1) { auto strides = inputs[i+batchCounter].get().strides(); @@ -326,6 +381,15 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i dim3 applyBlock, catGrid; +#ifdef USE_ROCM + // always base grid size on max_elements_per_tensor + { + std::tuple launchParams = getCatGridRocm( + max_elements_per_tensor, batchCounter); + catGrid = std::get<0>(launchParams); + applyBlock = std::get<1>(launchParams); + } +#else if (isContig && sizeof(scalar_t) > 2) { std::tuple launchParams = getCatGridContig( max_elements_per_tensor, batchCounter); @@ -335,6 +399,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i applyBlock = dim3(32 * 16); getCatGrid(batchCounter, catGrid); } +#endif if (memory_format != c10::MemoryFormat::Contiguous) { switch (dimension) { @@ -353,6 +418,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i CatArrayBatchedCopy_aligned16_contig<<<\ catGrid, applyBlock, 0, stream.stream()>>>(\ data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\ + } else if (isContig) {\ + CatArrayBatchedCopy_contig<<<\ + catGrid, applyBlock, 0, stream.stream()>>>(\ + data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\ } else {\ CatArrayBatchedCopy<<<\ catGrid, applyBlock, 0, stream.stream()>>>(\ diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index b57d778d7cb58..cffd52624f9e3 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -15,6 +15,7 @@ #include #include #include +#include #ifndef 
AT_PER_OPERATOR_HEADERS #include @@ -172,11 +173,39 @@ inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { return dim3(block_size); } +inline dim3 SoftMaxForward_getBlockSize(uint64_t dim_size) { + uint64_t block_size = 1; + uint64_t max_block_size = std::min(dim_size, static_cast(max_threads)); + + // We need a block size that is a multiple of C10_WARP_SIZE in order + // to perform block size reductions using warp shuffle instructions. + // Since max_threads is also a multiple of C10_WARPS_SIZE we do not + // risk creating a block size larger than the limit. + + if (max_block_size % C10_WARP_SIZE == 0) { + block_size = max_block_size; + } else { + block_size = (max_block_size / C10_WARP_SIZE + 1) * C10_WARP_SIZE; + } + + return dim3(block_size); +} + template struct Add { __device__ __forceinline__ T operator()(T a, T b) const { return a + b; } + + __device__ __forceinline__ T combine(T a, T b) const { + return a + b; + } + + // Needed to allow warp level reduction as a first step in the + // thread block reduction + __device__ __forceinline__ T warp_shfl_down(T data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } }; template @@ -184,6 +213,16 @@ struct Max { __device__ __forceinline__ T operator()(T a, T b) const { return a < b ? b : a; } + + __device__ __forceinline__ T combine(T a, T b) const { + return a < b ? b : a; + } + + // Needed to allow warp level reduction as a first step in the + // thread block reduction + __device__ __forceinline__ T warp_shfl_down(T data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } }; // Note that it's not a complete block-wide reduction. @@ -396,6 +435,20 @@ blockReduce(AccumT* smem, AccumT val, return smem[0]; } +// Performs a thread block reduction with a given functor but uses +// warp shuffles as the first step in the reduction +template class Reduction, typename T> +__device__ __forceinline__ +T blockReduceWarp(T* smem_cache, T value, const Reduction& op, T defaultVal) +{ + T result = cuda_utils::BlockReduce>(value, op, defaultVal, smem_cache); + if (threadIdx.x == 0) { + smem_cache[0] = result; + } + __syncthreads(); + return smem_cache[0]; +} + template class Reduction, int ILP, typename T, typename AccumT, typename index_t=int> __device__ __forceinline__ AccumT ilpReduce(index_t shift, @@ -556,7 +609,7 @@ WriteBpropResultsVectorized( } /** - * This will apply the Epilogue with non-vectrorized reads & writes for the general case + * This will apply the Epilogue with non-vectorized reads & writes for the general case */ template class Epilogue> __device__ __forceinline__ void @@ -565,26 +618,7 @@ WriteFpropResults( const scalar_t *input, outscalar_t *output, Epilogue epilogue) { - int offset = threadIdx.x; - - int last = classes % (ILP * blockDim.x); - - // Main bulk of loop with ILP - for (; offset < classes - last; offset += blockDim.x * ILP) { - scalar_t tmp[ILP]; - - #pragma unroll - for (int j = 0; j < ILP; ++j) { - tmp[j] = input[offset + j * blockDim.x]; - } - #pragma unroll - for (int j = 0; j < ILP; ++j) { - output[offset + j * blockDim.x] = epilogue(tmp[j]); - } - } - - // Remainder - no ILP - for (; offset < classes; offset += blockDim.x) { + for (int offset = threadIdx.x; offset < classes; offset += blockDim.x) { output[offset] = epilogue(input[offset]); } } @@ -631,9 +665,6 @@ cunn_SoftMaxForward(outscalar_t *output, const scalar_t *input, int classes) extern __shared__ unsigned char smem[]; auto sdata = reinterpret_cast(smem); - using LoadT = at::native::memory::aligned_vector; - using 
StoreT = at::native::memory::aligned_vector; - // forward pointers to batch[blockIdx.x] // each block handles a sample in the mini-batch input += static_cast(blockIdx.x) * classes; @@ -644,15 +675,15 @@ cunn_SoftMaxForward(outscalar_t *output, const scalar_t *input, int classes) // find the max accscalar_t threadMax = ilpReduce( - shift, input, classes, MaxFloat(), -at::numeric_limits::max()); - accscalar_t max_k = blockReduce( - sdata, threadMax, Max(), -at::numeric_limits::max()); + shift, input, classes, MaxFloat(), -at::numeric_limits::max()); + accscalar_t max_k = blockReduceWarp(sdata, threadMax, + Max(), -at::numeric_limits::max()); // reduce all values accscalar_t threadExp = ilpReduce( - shift, input, classes, SumExpFloat(max_k), static_cast(0)); - accscalar_t sumAll = blockReduce( - sdata, threadExp, Add(), static_cast(0)); + shift, input, classes, SumExpFloat(max_k), static_cast(0)); + accscalar_t sumAll = blockReduceWarp(sdata, threadExp, + Add(), static_cast(0)); Epilogue epilogue(max_k, sumAll); @@ -663,6 +694,78 @@ cunn_SoftMaxForward(outscalar_t *output, const scalar_t *input, int classes) } } +template class Epilogue, typename index_t = int32_t> +__global__ void +cunn_SoftMaxForwardSmem(outscalar_t *output, const scalar_t *input, index_t classes) +{ + // Each thread block processes a sample in the batch + input += static_cast(blockIdx.x) * classes; + output += static_cast(blockIdx.x) * classes; + + accscalar_t threadMax = -at::numeric_limits::max(); + accscalar_t threadExp = static_cast(0); + + // The first smem segment is used to cache input values and the last + // segment is used for thread block reductions + extern __shared__ unsigned char smem[]; + auto smem_input_cache = reinterpret_cast(smem); + auto smem_reduction_cache = reinterpret_cast(smem + + classes * sizeof(scalar_t)); + + using LoadT = at::native::memory::aligned_vector; + const LoadT* const input_vec_ptr = reinterpret_cast(input); + LoadT* const smem_input_cache_vec_ptr = reinterpret_cast(smem_input_cache); + + // Download inputs to shared memory while doing the first step + // in max calculation + MaxFloat maxFunc; + for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) { + LoadT crnt_vec = input_vec_ptr[offset]; + smem_input_cache_vec_ptr[offset] = crnt_vec; + + #pragma unroll + for (int i = 0; i < ILP; ++i) { + threadMax = maxFunc(threadMax, crnt_vec.val[i]); + } + } + + accscalar_t max_k = blockReduceWarp(smem_reduction_cache, threadMax, + Max(), -at::numeric_limits::max()); + + // Reload input from shared memory to compute the sum. The previous + // reduce has performed a __syncthreads() so the smem contents are populated. 
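// For reference only (generic CUDA, not code from this file): the warp-level step
// that the combine()/warp_shfl_down() members added to Add and Max enable is the
// usual shuffle tree,
//
//   for (int offset = C10_WARP_SIZE / 2; offset > 0; offset /= 2)
//     val = op.combine(val, op.warp_shfl_down(val, offset));
//
// after which lane 0 of each warp holds that warp's partial; blockReduceWarp then
// combines the per-warp partials through smem_reduction_cache and synchronizes
// before returning, which is the guarantee the comment above relies on.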
+ SumExpFloat sumExpFunc(max_k); + for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) { + LoadT crnt_vec = smem_input_cache_vec_ptr[offset]; + + #pragma unroll + for (int i = 0; i < ILP; ++i) { + threadExp = sumExpFunc(threadExp, crnt_vec.val[i]); + } + } + + accscalar_t sumAll = blockReduceWarp(smem_reduction_cache, threadExp, + Add(), static_cast(0)); + + Epilogue epilogue(max_k, sumAll); + + // Use vectorized stores to save the output + using StoreT = at::native::memory::aligned_vector; + StoreT* output_vec_ptr = reinterpret_cast(output); + for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) { + LoadT crnt_vec = smem_input_cache_vec_ptr[offset]; + StoreT out_vec; + + #pragma unroll + for (int i = 0; i < ILP; ++i) { + out_vec.val[i] = epilogue(crnt_vec.val[i]); + } + + output_vec_ptr[offset] = out_vec; + } +} + C10_DEVICE bool inline is_32bit_representable(const int64_t value) { return value < static_cast(std::numeric_limits::max()); } @@ -741,9 +844,9 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "host_softmax", [&] { using accscalar_t = acc_type; if (!half_to_float) { + auto output_ptr = output.mutable_data_ptr(); + auto input_ptr = input.const_data_ptr(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { - auto output_ptr = output.mutable_data_ptr(); - auto input_ptr = input.const_data_ptr(); int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { @@ -755,16 +858,31 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); - dim3 block = SoftMax_getBlockSize(ILP, dim_size); - cunn_SoftMaxForward - <<>>( - output.mutable_data_ptr(), input.const_data_ptr(), dim_size); + dim3 block = SoftMaxForward_getBlockSize(dim_size); + size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - + smem_reduction_sz) / sizeof(scalar_t); + + bool can_use_smem = dim_size < max_elements_per_smem; + can_use_smem &= !(reinterpret_cast(input_ptr) % ALIGN_BYTES); + can_use_smem &= (!(reinterpret_cast(output_ptr) % ALIGN_BYTES)); + can_use_smem &= !(dim_size % ILP); + + if (can_use_smem) { + size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; + cunn_SoftMaxForwardSmem + <<>>(output_ptr, input_ptr, dim_size); + } else { + cunn_SoftMaxForward + <<>>(output_ptr, input_ptr, dim_size); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } else { + auto output_ptr = output.mutable_data_ptr(); + auto input_ptr = input.const_data_ptr(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { - auto output_ptr = output.mutable_data_ptr(); - auto input_ptr = input.const_data_ptr(); int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { @@ -775,11 +893,26 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t remaining -= chunk_size; } } else { - constexpr int ILP = sizeof(float4) / sizeof(accscalar_t); - dim3 block = SoftMax_getBlockSize(ILP, dim_size); - cunn_SoftMaxForward - <<>>( - output.mutable_data_ptr(), input.const_data_ptr(), dim_size); + constexpr int ILP = sizeof(float4) / sizeof(scalar_t); + dim3 block = SoftMaxForward_getBlockSize(dim_size); + size_t smem_reduction_sz = block.x / 
C10_WARP_SIZE * sizeof(accscalar_t); + auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - + smem_reduction_sz) / sizeof(scalar_t); + + bool can_use_smem = dim_size < max_elements_per_smem; + can_use_smem &= !(reinterpret_cast(input_ptr) % ALIGN_BYTES); + can_use_smem &= (!(reinterpret_cast(output_ptr) % ALIGN_BYTES)); + can_use_smem &= !(dim_size % ILP); + + if (can_use_smem) { + size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; + cunn_SoftMaxForwardSmem + <<>>(output_ptr, input_ptr, dim_size); + } else { + cunn_SoftMaxForward + <<>>(output_ptr, input_ptr, dim_size); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } diff --git a/aten/src/ATen/native/cuda/Sorting.cu b/aten/src/ATen/native/cuda/Sorting.cu index 313c6d1ea981b..6272bbb9b75df 100644 --- a/aten/src/ATen/native/cuda/Sorting.cu +++ b/aten/src/ATen/native/cuda/Sorting.cu @@ -22,7 +22,7 @@ namespace { // Finds the rank k element, and its index, of the values along dimension dim template __global__ void gatherKthValue( - cuda::detail::TensorInfo input, + cuda::detail::TensorInfo input, index_t inputSliceSize, index_t k, index_t numInputSlices, @@ -40,13 +40,13 @@ __global__ void gatherKthValue( // Find the start offset for our slice index_t sliceStartIndex = - cuda::detail::IndexToOffset::get(slice, input); + cuda::detail::IndexToOffset::get(slice, input); index_t kthValueSliceStartIndex = cuda::detail::IndexToOffset::get(slice, kthValue); index_t indicesSliceStartIndex = cuda::detail::IndexToOffset::get(slice, indices); - scalar_t* inputSliceStart = &input.data[sliceStartIndex]; + const scalar_t* inputSliceStart = &input.data[sliceStartIndex]; scalar_t* kthValueSliceStart = &kthValue.data[kthValueSliceStartIndex]; int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; @@ -92,7 +92,7 @@ template __global__ void gatherMedian( cuda::detail::TensorInfo values, cuda::detail::TensorInfo indices, - cuda::detail::TensorInfo input, + cuda::detail::TensorInfo input, index_t inputSliceSize, index_t numInputSlices, index_t inputWithinSliceStride, @@ -112,11 +112,11 @@ __global__ void gatherMedian( index_t indicesSliceStartIndex = cuda::detail::IndexToOffset::get(slice, indices); index_t inputSliceStartIndex = - cuda::detail::IndexToOffset::get(slice, input); + cuda::detail::IndexToOffset::get(slice, input); scalar_t* valuesSliceStart = &values.data[valuesSliceStartIndex]; int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; - scalar_t* inputSliceStart = &input.data[inputSliceStartIndex]; + const scalar_t* inputSliceStart = &input.data[inputSliceStartIndex]; index_t nan_count = 0; for (index_t i = threadIdx.x; i < inputSliceSize; i += blockDim.x) { @@ -178,7 +178,7 @@ struct KthValueLauncher { int collapse_values_dim, cuda::detail::TensorInfo indices_info, int collapse_indices_dim, - cuda::detail::TensorInfo self_info, + cuda::detail::TensorInfo self_info, int collapse_self_dim, int64_t num_slices, int64_t slice_size) { @@ -216,7 +216,7 @@ struct MedianLauncher { int collapse_values_dim, cuda::detail::TensorInfo indices_info, int collapse_indices_dim, - cuda::detail::TensorInfo self_info, + cuda::detail::TensorInfo self_info, int collapse_self_dim, int64_t num_slices, int64_t slice_size) { @@ -247,8 +247,8 @@ struct MedianLauncher { void launch_kthvalue_kernel( const TensorBase &values, const TensorBase &indices, const TensorBase &self, int64_t dim, int64_t k) { - AT_DISPATCH_ALL_TYPES_AND( - at::ScalarType::Half, self.scalar_type(), "kthvalue_cuda", [&] { + 
AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "kthvalue_cuda", [&] { AT_DISPATCH_INDEX_TYPES( cuda::detail::canUse32BitIndexMath(self) && cuda::detail::canUse32BitIndexMath(values) && @@ -263,8 +263,8 @@ void launch_kthvalue_kernel( void launch_median_kernel( const TensorBase &vals, const TensorBase &inds, const TensorBase &self, int64_t dim, bool ignore_nan) { - AT_DISPATCH_ALL_TYPES_AND( - at::ScalarType::Half, self.scalar_type(), "median_out_impl", [&] { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "median_out_impl", [&] { if (cuda::detail::canUse32BitIndexMath(vals) && cuda::detail::canUse32BitIndexMath(inds) && cuda::detail::canUse32BitIndexMath(self)) { diff --git a/aten/src/ATen/native/cuda/SortingCommon.cuh b/aten/src/ATen/native/cuda/SortingCommon.cuh index 4f151c407fea7..c4a8ec6864a1d 100644 --- a/aten/src/ATen/native/cuda/SortingCommon.cuh +++ b/aten/src/ATen/native/cuda/SortingCommon.cuh @@ -116,7 +116,7 @@ void run_launcher( const TensorBase &self, int64_t dim, Launcher l) { - auto self_info = cuda::detail::getTensorInfo(self); + auto self_info = cuda::detail::getTensorInfo(self); auto values_info = cuda::detail::getTensorInfo(values); auto indices_info = cuda::detail::getTensorInfo(indices); diff --git a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh index 238479c545234..1aeaca19652a6 100644 --- a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh +++ b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh @@ -186,7 +186,7 @@ __device__ void countRadixUsingMask( int radixDigitPos, index_t sliceSize, index_t withinSliceStride, - scalar_t* data) { + const scalar_t* data) { // Clear out per-thread counts from a previous round #pragma unroll for (int i = 0; i < RadixSize; ++i) { @@ -256,7 +256,7 @@ constexpr int RADIX_MASK = (RADIX_SIZE - 1); template __device__ scalar_t findPattern( scalar_t* smem, - scalar_t* data, + const scalar_t* data, index_t sliceSize, index_t withinSliceStride, bitwise_t desired, @@ -304,7 +304,7 @@ __device__ scalar_t findPattern( // Returns the top-Kth element found in the data using radix selection template __device__ void radixSelect( - scalar_t* data, + const scalar_t* data, index_t k, bool largest, index_t sliceSize, diff --git a/aten/src/ATen/native/cuda/SpectralOps.cpp b/aten/src/ATen/native/cuda/SpectralOps.cpp index 6a0c05a9e5424..1032fb28d799c 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cpp +++ b/aten/src/ATen/native/cuda/SpectralOps.cpp @@ -38,52 +38,8 @@ using namespace at::native::detail; static void exec_cufft_plan( const CuFFTConfig &config, void* in_data, void* out_data, bool forward) { auto& plan = config.plan(); -#if defined(USE_ROCM) - auto value_type = config.data_type(); - if (value_type == kFloat) { - switch (config.transform_type()) { - case CuFFTTransformType::C2C: { - CUFFT_CHECK(hipfftExecC2C(plan, static_cast(in_data), - static_cast(out_data), - forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case CuFFTTransformType::R2C: { - CUFFT_CHECK(hipfftExecR2C(plan, static_cast(in_data), - static_cast(out_data))); - return; - } - case CuFFTTransformType::C2R: { - CUFFT_CHECK(hipfftExecC2R(plan, static_cast(in_data), - static_cast(out_data))); - return; - } - } - } else if (value_type == kDouble) { - switch (config.transform_type()) { - case CuFFTTransformType::C2C: { - CUFFT_CHECK(hipfftExecZ2Z(plan, static_cast(in_data), - static_cast(out_data), - forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case CuFFTTransformType::R2C: { - CUFFT_CHECK(hipfftExecD2Z(plan, static_cast(in_data), - static_cast(out_data))); - return; - } - case CuFFTTransformType::C2R: { - CUFFT_CHECK(hipfftExecZ2D(plan, static_cast(in_data), - static_cast(out_data))); - return; - } - } - } - TORCH_CHECK(false, "hipFFT doesn't support transforms on type: ", value_type); -#else CUFFT_CHECK(cufftXtExec(plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); -#endif } @@ -315,7 +271,7 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_ at::globalContext().getNVRTC().cuCtxSetCurrent(pctx); } #endif /* !defined(USE_ROCM) */ - exec_cufft_plan(*config, input.data_ptr(), out.data_ptr(), forward); + exec_cufft_plan(*config, const_cast(input.const_data_ptr()), out.data_ptr(), forward); // Inplace reshaping to original batch shape and inverting the dimension permutation DimVector out_strides(ndim); @@ -387,7 +343,7 @@ Tensor _fft_r2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization // CuFFT requires real input to be over-aligned, as if it were complex const auto complex_size = 2 * self.element_size(); const bool complex_aligned = ( - reinterpret_cast(self.data_ptr()) % complex_size == 0); + reinterpret_cast(self.const_data_ptr()) % complex_size == 0); auto working_tensor = self; if (!complex_aligned) { working_tensor = self.movedim(last_dim, -1) diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index a35e5e274b7e2..f2626ccff4db7 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -65,7 +65,7 @@ C10_LAUNCH_BOUNDS_1(cuda::getApplyBlockSize()) __global__ void kernelHistogram1D( detail::TensorInfo a, /* output */ detail::TensorInfo p, /* partial output */ - detail::TensorInfo b, /* input */ + detail::TensorInfo b, /* input */ int64_t nbins, at::acc_type minvalue, at::acc_type maxvalue, @@ -86,7 +86,7 @@ __global__ void kernelHistogram1D( FOR_KERNEL_LOOP(linearIndex, totalElements) { // Convert `linearIndex` into an offset of `b` const IndexType bOffset = - detail::IndexToOffset::get(linearIndex, b); + detail::IndexToOffset::get(linearIndex, b); const auto bVal = b.data[bOffset]; if (bVal >= minvalue && bVal <= maxvalue) { // Use value at `b` as an offset of `smem` @@ -112,7 +112,7 @@ __global__ void kernelHistogram1D( FOR_KERNEL_LOOP(linearIndex, totalElements) { // Convert `linearIndex` into an offset of `b` const IndexType bOffset = - detail::IndexToOffset::get(linearIndex, b); + detail::IndexToOffset::get(linearIndex, b); const auto bVal = b.data[bOffset]; if (bVal >= minvalue && bVal <= maxvalue) { // Use value at `b` as an offset of `a` @@ -192,7 +192,7 @@ bool CUDA_tensor_histogram( const dim3 block = getApplyBlock(); dim3 grid; - int64_t curDevice = current_device(); + auto curDevice = current_device(); if (curDevice == -1 || !getApplyGrid(totalElements, grid, curDevice)) { return false; } @@ -219,7 +219,7 @@ bool CUDA_tensor_histogram( using IndexType = int64_t; auto aInfo = detail::getTensorInfo(a); - auto bInfo = detail::getTensorInfo(b); + auto bInfo = detail::getTensorInfo(b); detail::TensorInfo pInfo(nullptr, 0, {}, {}); if (HasWeights) { diff --git a/aten/src/ATen/native/cuda/TensorShape.cu b/aten/src/ATen/native/cuda/TensorShape.cu new file mode 100644 index 0000000000000..d82901ef94529 --- /dev/null +++ b/aten/src/ATen/native/cuda/TensorShape.cu @@ -0,0 +1,841 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS 
+#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + +namespace at::native { + +namespace detail { + +// NOTE [CUDA fast path for split_with_sizes_copy.out] +// split_with_sizes_copy.out for contiguous operands has the following +// properties: +// - Each src split consists of multiple chunks that are separated by a fixed +// stride. The number of chunks and the strides are the same across all src +// splits. +// - Each dst split is the concatenation of the chunks in its corresponding src +// splits. +// - The sizes of chunks vary across splits. +// - A (src, dst) chunk pair is not guaranteed to have the +// same alignment. +// +// The following strategies are employed to optimize for this workload: +// - The entire workload is fused into a single kernel to maximize I/O +// throughput and minimize wave quantization. +// - To account for both small and large chunk sizes, a "jagged grid" is used. +// Each chunk is processed by one or more blocks depending on its size. +// - Within each chunk, the region in which writes can be vectorized is +// identified. Within this region, writes are always vectorized and reads are +// oppurtunistically vectorized. +static constexpr int64_t BLOCK_SIZE = 128; +static constexpr int64_t BYTES_PER_THREAD = 16; +static constexpr int64_t BYTES_PER_BLOCK = BYTES_PER_THREAD * BLOCK_SIZE; + +static __host__ __device__ inline int64_t div_up(int64_t a, int64_t b) { + return (a + b - 1) / b; +} + +template +__device__ inline void stream_load128(uint4& val, const T* addr) { + uint64_t low, high; +#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)) + low = reinterpret_cast(addr)[0]; + high = reinterpret_cast(addr)[1]; +#else + asm("ld.global.nc.v2.u64 {%0, %1}, [%2];" + : "=l"(low), "=l"(high) + : "l"(addr)); +#endif + reinterpret_cast(&val)[0] = low; + reinterpret_cast(&val)[1] = high; +} + +template +__device__ inline void stream_store128(T* addr, const uint4& val) { + uint64_t low, high; + low = reinterpret_cast(&val)[0]; + high = reinterpret_cast(&val)[1]; +#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)) + reinterpret_cast(addr)[0] = low; + reinterpret_cast(addr)[1] = high; +#else + asm("st.global.cs.v2.u64 [%0], {%1, %2};" : : "l"(addr), "l"(low), "l"(high)); +#endif +} + +template +static __device__ inline bool is_aligned(const void* addr) { + return reinterpret_cast(addr) % sizeof(T) == 0; +} + +template +static __device__ inline void load128(uint4& val, const char* addr) { + for (size_t i = 0; i < detail::BYTES_PER_THREAD / sizeof(T); ++i) { + reinterpret_cast(&val)[i] = reinterpret_cast(addr)[i]; + } +} + +template <> +__device__ inline void load128(uint4& val, const char* addr) { + stream_load128(val, addr); +} + +static __device__ inline void load128(uint4& val, const char* addr) { + if (is_aligned(addr)) { + load128(val, addr); + } else if (is_aligned(addr)) { + load128(val, addr); + } else if (is_aligned(addr)) { + load128(val, addr); + } else { + load128(val, addr); + } +} + +static __device__ __inline__ void get_aligned_region( + char* ptr, + const int64_t chunk_size, + const int64_t alignment, + int64_t& align_off, + int64_t& aligned_size) { + const int64_t ptr_val = reinterpret_cast(ptr); + align_off = detail::div_up(ptr_val, alignment) * alignment - ptr_val; + aligned_size = (chunk_size - align_off) / alignment * alignment; +} + +static __device__ __inline__ void copy_chunk( + char* 
dst, + const char* src, + int64_t chunk_size, + int64_t thread_idx, + int64_t num_threads) { + if (chunk_size < num_threads) { + if (thread_idx < chunk_size) { + dst[thread_idx] = src[thread_idx]; + } + return; + } + + // Identify the region in which writes are guaranteed to be 128-bit aligned + int64_t align_off, aligned_size; + get_aligned_region( + dst, chunk_size, detail::BYTES_PER_THREAD, align_off, aligned_size); + + for (int64_t off = align_off + thread_idx * detail::BYTES_PER_THREAD; + off < align_off + aligned_size; + off += num_threads * detail::BYTES_PER_THREAD) { + uint4 val; + // Oppurtunistically vectorize reads + load128(val, &src[off]); + stream_store128(&dst[off], val); + } + + // Handle unaligned regions + if (thread_idx < align_off && thread_idx < chunk_size) { + dst[thread_idx] = src[thread_idx]; + } + if (align_off + aligned_size + thread_idx < chunk_size) { + dst[align_off + aligned_size + thread_idx] = + src[align_off + aligned_size + thread_idx]; + } +} + +static __global__ void split_with_sizes_copy_out_contiguous_no_cast_kernel( + char** dst_base_addrs, + char** src_base_addrs, + int64_t* split_chunk_sizes, + int64_t* block_idx_to_split_idx, + int64_t* blocks_cumsums, + int64_t src_stride, + int64_t num_chunks) { + const int64_t split_idx = block_idx_to_split_idx[blockIdx.x]; + const int64_t split_blocks = + blocks_cumsums[split_idx + 1] - blocks_cumsums[split_idx]; + const int64_t split_threads = split_blocks * blockDim.x; + const int64_t split_thread_idx = + (blockIdx.x - blocks_cumsums[split_idx]) * blockDim.x + threadIdx.x; + const int64_t split_chunk_size = split_chunk_sizes[split_idx]; + + char* dst_base_addr = dst_base_addrs[split_idx]; + char* src_base_addr = src_base_addrs[split_idx]; + + for (int64_t i = blockIdx.y; i < num_chunks; i += gridDim.y) { + copy_chunk( + dst_base_addr + i * split_chunk_size, + src_base_addr + i * src_stride, + split_chunk_size, + split_thread_idx, + split_threads); + } +} + +// Calculate the base addr for each split. +static inline std::vector get_split_base_addrs( + const at::Tensor& tensor, + at::IntArrayRef split_sizes, + int64_t dim) { + const auto* data_ptr = static_cast(tensor.const_data_ptr()); + const auto strides = tensor.strides(); + const auto element_sz = tensor.element_size(); + int64_t off = 0; + std::vector split_base_addrs; + split_base_addrs.reserve(split_sizes.size()); + for (const auto& split_size : split_sizes) { + split_base_addrs.push_back(reinterpret_cast(data_ptr + off)); + off += split_size * strides[dim] * element_sz; + } + return split_base_addrs; +} + +static inline std::vector get_dst_addrs(at::TensorList out) { + std::vector addrs; + addrs.reserve(out.size()); + for (const auto& tensor : out) { + addrs.push_back(reinterpret_cast(tensor.data_ptr())); + } + return addrs; +} + +// Calculate the chunk size for each split in bytes. +static inline std::vector get_split_chunk_sizes( + const at::Tensor& tensor, + at::IntArrayRef split_sizes, + int64_t dim) { + const auto stride = tensor.stride(dim); + const auto element_sz = tensor.element_size(); + std::vector split_chunk_sizes; + split_chunk_sizes.reserve(split_sizes.size()); + for (const auto& split_size : split_sizes) { + split_chunk_sizes.push_back(split_size * stride * element_sz); + } + return split_chunk_sizes; +} + +// Calculate the chunk stride in bytes. This is the same for all splits. 
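// Illustrative example (hypothetical shapes): for a contiguous float32 `self` of
// size [8, 4, 6] split along dim = 1, get_chunk_stride below returns
// 4 * 6 * 4 = 96 bytes, get_num_chunks returns 192 / (4 * 6) = 8, and with
// split_sizes = {1, 3} the split base addresses computed above are data_ptr and
// data_ptr + 1 * 6 * 4 = data_ptr + 24 bytes.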
+static inline int64_t get_chunk_stride(const at::Tensor& tensor, int64_t dim) { + int64_t stride = 1; + for (int64_t d = dim; d < tensor.dim(); ++d) { + stride *= tensor.sizes()[d]; + } + return stride * tensor.element_size(); +} + +// Calculate the number of chunks. This is the same for all splits. +static inline int64_t get_num_chunks(const at::Tensor& tensor, int64_t dim) { + int64_t num_chunks = tensor.numel(); + for (int64_t d = dim; d < tensor.dim(); ++d) { + num_chunks /= tensor.sizes()[d]; + } + return num_chunks; +} + +// Pack multiple std::vector into a single cuda tensor. +std::pair> pack_vecs( + std::vector*> vecs, + const at::Device& device) { + int64_t numel = 0; + for (const auto* vec : vecs) { + numel += vec->size(); + } + + auto packed = at::empty( + {numel}, at::TensorOptions().dtype(at::kLong).pinned_memory(true)); + size_t offset = 0; + for (const auto* vec : vecs) { + memcpy( + packed.data_ptr() + offset, + vec->data(), + sizeof(int64_t) * vec->size()); + offset += vec->size(); + } + packed = packed.to(device, /*non_blocking=*/true); + + std::vector ptrs; + ptrs.reserve(vecs.size()); + offset = 0; + for (const auto* vec : vecs) { + ptrs.push_back(packed.data_ptr() + offset); + offset += vec->size(); + } + return std::make_pair(std::move(packed), std::move(ptrs)); +} + +static inline std::vector get_chunk_cat_out_sizes( + IntArrayRef input_tensor_sizes, + int64_t dim, + int64_t num_chunks, + int64_t chunk_size, + int64_t out_element_size) { + std::vector view_sizes = std::vector( + input_tensor_sizes.begin(), input_tensor_sizes.begin() + dim); + view_sizes.insert( + view_sizes.end(), {num_chunks, chunk_size / out_element_size}); + return view_sizes; +} + +// Copy `max_chunk_size` bytes from `src` to `dst` by `num_threads`, and pad +// zero when `src` size (i.e., actual_chunk_size) is less than `max_chunk_size`. +// Assume elements of src and dst have the same data type. +template +__device__ __inline__ void copy_chunk_with_pad( + dst_t* dst_ptr, + src_t* src_ptr, + int64_t max_chunk_size, + int64_t actual_chunk_size, + int64_t thread_idx, + int64_t num_threads) { + // Supports type cast + if (!std::is_same_v) { + const int64_t max_num_elems = max_chunk_size / sizeof(dst_t); + const int64_t actual_num_elems = actual_chunk_size / sizeof(src_t); + int64_t elem_index = thread_idx; + while (elem_index < actual_num_elems) { + dst_ptr[elem_index] = + static_cast_with_inter_type::apply(src_ptr[elem_index]); + elem_index += num_threads; + } + while (elem_index < max_num_elems) { + dst_ptr[elem_index] = static_cast_with_inter_type::apply(0); + elem_index += num_threads; + } + return; + } + char* dst = reinterpret_cast(dst_ptr); + char* src = reinterpret_cast(src_ptr); + // Fast path when the number of threads is larger than the number of bytes to + // be copied (i.e., max_chunk_size). In this case, each thread only copies 1 + // byte. For 0 <= thread_idx < actual_chunk_size, the thread copies data from + // `src`. For actual_chunk_size <= thread_idx < max_chunk_size, the thread set + // the val=0 for padding. + if (max_chunk_size < num_threads) { + char val = static_cast(0); + if (thread_idx < actual_chunk_size) { + val = src[thread_idx]; + } + if (thread_idx < max_chunk_size) { + dst[thread_idx] = val; + } + return; + } + // Split dst array into three parts: + // [dst, dst+align_off), [dst+align_off, dst+align_end), [dst+align_end, + // dst+max_chunk_size) The second part is aligned with BYTES_PER_THREAD(=16 + // bytes) to enable `stream_store128`. 
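// Worked example with made-up numbers: if dst starts at an address ending in 0x4
// and actual_chunk_size = 100, get_aligned_region yields align_off = 12 and
// aligned_size = (100 - 12) / 16 * 16 = 80, so bytes [0, 12) and
// [92, max_chunk_size) are handled one byte per thread while [12, 92) goes through
// 16-byte stream_store128, reading with stream_load128 only when src + i is itself
// 16-byte aligned.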
+ int64_t align_off, aligned_size; + get_aligned_region( + dst, actual_chunk_size, BYTES_PER_THREAD, align_off, aligned_size); + int64_t align_end = align_off + aligned_size; + for (int64_t i = align_off + thread_idx * BYTES_PER_THREAD; i < align_end; + i += num_threads * BYTES_PER_THREAD) { + uint4 val; + if (is_aligned(src + i)) { + stream_load128(val, src + i); + } else { + for (size_t j = 0; j < BYTES_PER_THREAD; ++j) { + reinterpret_cast(&val)[j] = src[i + j]; + } + } + stream_store128(&dst[i], val); + } + // Copy data for the first part of dst array [dst, dst+align_off). + // Check `thread_idx +static __global__ void chunk_cat_cuda_kernel( + src_t** src, + dst_t* dst, + int64_t* block_idx_to_tensor_idx, + int64_t* tensor_idx_to_start_tensor_bytes, + int64_t* start_block_idx_per_tensor_chunk, + int64_t* actual_tensor_sizes, + int64_t* pad_tensor_chunk_sizes, + int64_t* num_blocks_per_tensor_chunk, + int64_t slice_size, + int64_t chunk_size, + int64_t dst_to_src_ratio) { + const int64_t slice_idx = blockIdx.z; + const int64_t chunk_idx = blockIdx.y; + const int64_t tensor_idx = block_idx_to_tensor_idx[blockIdx.x]; + const int64_t tile_idx = + blockIdx.x - start_block_idx_per_tensor_chunk[tensor_idx]; + // Number of threads for the `tensor_idx`-th tensor chunk. + const int64_t num_threads = + num_blocks_per_tensor_chunk[tensor_idx] * BLOCK_SIZE; + const int64_t thread_idx = tile_idx * BLOCK_SIZE + threadIdx.x; + char* src_addr = reinterpret_cast(src)[tensor_idx] + + slice_idx * actual_tensor_sizes[tensor_idx] + + chunk_idx * pad_tensor_chunk_sizes[tensor_idx] / dst_to_src_ratio; + char* dst_addr = reinterpret_cast(dst) + slice_idx * slice_size + + chunk_idx * chunk_size + tensor_idx_to_start_tensor_bytes[tensor_idx]; + // Compute the actual number of bytes to copy from src. + const int64_t actual_copy_size = std::min( + pad_tensor_chunk_sizes[tensor_idx] / dst_to_src_ratio, + std::max( + (int64_t)0, + actual_tensor_sizes[tensor_idx] - + chunk_idx * pad_tensor_chunk_sizes[tensor_idx] / + dst_to_src_ratio)); + copy_chunk_with_pad( + reinterpret_cast(dst_addr), + reinterpret_cast(src_addr), + pad_tensor_chunk_sizes[tensor_idx], + actual_copy_size, + thread_idx, + num_threads); +} + +bool all_contiguous(TensorList tensors) { + bool contiguous = true; + for (const auto& t : tensors) { + contiguous &= t.is_non_overlapping_and_dense(); + } + return contiguous; +} + +// Get leading dimensions before `dim`-th dimension. +static inline int64_t get_leading_dim(at::IntArrayRef sizes, int64_t dim) { + int64_t leading_dim = 1; + if (dim > 0) { + leading_dim = c10::multiply_integers(sizes.slice(0, dim)); + } + return leading_dim; +} + +// Get trailing dimensions after `dim`-th dimension and padded size along +// `dim`-th dimension. +static inline std::pair get_pad_size( + at::IntArrayRef sizes, + int64_t dim, + int64_t num_chunks) { + int64_t trailing_numel = 1; + if (sizes.size() > (uint64_t)dim + 1) { + trailing_numel = + c10::multiply_integers(sizes.slice(dim + 1, sizes.size() - dim - 1)); + } + int64_t pad_size_along_dim = + detail::div_up(sizes[dim], num_chunks) * num_chunks; + return std::make_pair(pad_size_along_dim, trailing_numel); +} + +// Get the padded chunk size. 
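// For instance (hypothetical shapes): chunking two float32 tensors of sizes [4, 5]
// and [4, 3] into num_chunks = 2 along dim = 1 gives pad_size_along_dim = 6 and 4
// respectively (rounded up to a multiple of num_chunks) with trailing_numel = 1,
// so the per-chunk contributions are 6 * 1 * 4 / 2 = 12 and 4 * 1 * 4 / 2 = 8
// bytes and the padded chunk_size comes out to 20 bytes.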
+static inline int64_t get_chunk_size( + TensorList tensors, + int64_t dim, + int64_t num_chunks, + int64_t elem_size) { + auto num_tensors = tensors.size(); + int64_t chunk_size = 0; + for (const auto i : c10::irange(num_tensors)) { + auto [pad_size_along_dim, trailing_numel] = + get_pad_size(tensors[i].sizes(), dim, num_chunks); + const int64_t pad_tensor_chunk_size = + pad_size_along_dim * trailing_numel * elem_size / num_chunks; + chunk_size += pad_tensor_chunk_size; + } + return chunk_size; +} + +// Get metadata for chunk_cat. +std::tuple< + int64_t, + int64_t, + int64_t, + int64_t, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector> +get_chunk_cat_metadata( + TensorList tensors, + int64_t dim, + int64_t num_chunks, + int64_t dst_elem_size, + int64_t src_elem_size) { + TORCH_CHECK( + dst_elem_size % src_elem_size == 0, + "get_chunk_cat_metadata error: only support dst_elem_size % src_elem_size == 0"); + auto num_tensors = tensors.size(); + int64_t leading_dim = get_leading_dim(tensors[0].sizes(), dim); + std::vector pad_tensor_chunk_sizes; + std::vector num_blocks_per_tensor_chunk; + std::vector start_block_idx_per_tensor_chunk{0}; + std::vector actual_tensor_sizes; + std::vector tensor_idx_to_start_tensor_bytes{0}; + std::vector srcs; + pad_tensor_chunk_sizes.reserve(num_tensors); + num_blocks_per_tensor_chunk.reserve(num_tensors); + start_block_idx_per_tensor_chunk.reserve(num_tensors + 1); + actual_tensor_sizes.reserve(num_tensors); + tensor_idx_to_start_tensor_bytes.reserve(num_tensors + 1); + srcs.reserve(num_tensors); + // block_idx_to_tensor_idx cannot be reserved since the number of blocks is + // data dependent + std::vector block_idx_to_tensor_idx; + // Inline computing `chunk_size` to avoid redundant computation + int64_t chunk_size = 0; + for (const auto i : c10::irange(num_tensors)) { + at::Tensor tensor = tensors[i]; + srcs.push_back(reinterpret_cast(tensor.data_ptr())); + auto sizes = tensor.sizes(); + auto [pad_size_along_dim, trailing_numel] = + get_pad_size(sizes, dim, num_chunks); + const int64_t pad_tensor_chunk_size = + pad_size_along_dim * trailing_numel * dst_elem_size / num_chunks; + pad_tensor_chunk_sizes.push_back(pad_tensor_chunk_size); + chunk_size += pad_tensor_chunk_size; + // Number of blocks required to process this tensor chunk. 
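// Illustrative values: with BYTES_PER_BLOCK = 16 * 128 = 2048, two tensors whose
// padded chunk sizes are 4096 and 1024 bytes get num_blocks = {2, 1},
// start_block_idx_per_tensor_chunk = {0, 2, 3} and block_idx_to_tensor_idx =
// {0, 0, 1}, which is how blockIdx.x alone identifies both the source tensor and
// the tile within its chunk in chunk_cat_cuda_kernel.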
+ const int64_t num_blocks = + detail::div_up(pad_tensor_chunk_size, detail::BYTES_PER_BLOCK); + num_blocks_per_tensor_chunk.push_back(num_blocks); + start_block_idx_per_tensor_chunk.push_back( + start_block_idx_per_tensor_chunk.back() + num_blocks); + block_idx_to_tensor_idx.insert( + block_idx_to_tensor_idx.end(), num_blocks, i); + tensor_idx_to_start_tensor_bytes.push_back( + tensor_idx_to_start_tensor_bytes.back() + pad_tensor_chunk_size); + actual_tensor_sizes.push_back(sizes[dim] * trailing_numel * src_elem_size); + } + const int64_t num_blocks_per_chunk = start_block_idx_per_tensor_chunk.back(); + const int64_t slice_size = num_chunks * chunk_size; + return std::make_tuple( + chunk_size, + leading_dim, + num_blocks_per_chunk, + slice_size, + srcs, + block_idx_to_tensor_idx, + tensor_idx_to_start_tensor_bytes, + start_block_idx_per_tensor_chunk, + actual_tensor_sizes, + pad_tensor_chunk_sizes, + num_blocks_per_tensor_chunk); +} + +// See [CUDA kernel for chunk_cat_cuda] +template +void _chunk_cat_out_cuda_contiguous( + TensorList tensors, + int64_t dim, + int64_t num_chunks, + Tensor& out, + int64_t dst_elem_size, + int64_t src_elem_size) { + const auto device = tensors[0].device(); + // `get_chunk_cat_metadata` must return vectors and `pack_vecs` cannot be + // moved into `get_chunk_cat_metadata`. Otherwise `packed` would point to + // vectors allocated inside `get_chunk_cat_metadata` which become out of local + // scope. + auto + [chunk_size, + leading_dim, + num_blocks_per_chunk, + slice_size, + srcs, + block_idx_to_tensor_idx, + tensor_idx_to_start_tensor_bytes, + start_block_idx_per_tensor_chunk, + actual_tensor_sizes, + pad_tensor_chunk_sizes, + num_blocks_per_tensor_chunk] = + get_chunk_cat_metadata( + tensors, dim, num_chunks, dst_elem_size, src_elem_size); + auto packed = pack_vecs( + {&srcs, + &block_idx_to_tensor_idx, + &tensor_idx_to_start_tensor_bytes, + &start_block_idx_per_tensor_chunk, + &actual_tensor_sizes, + &pad_tensor_chunk_sizes, + &num_blocks_per_tensor_chunk}, + device); + std::vector view_sizes = get_chunk_cat_out_sizes( + tensors[0].sizes(), dim, num_chunks, chunk_size, dst_elem_size); + at::native::resize_output(out, view_sizes); + dim3 blocks(num_blocks_per_chunk, num_chunks, leading_dim); + dim3 threads(detail::BLOCK_SIZE, 1, 1); + detail::chunk_cat_cuda_kernel<<< + blocks, + threads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + /*srcs=*/reinterpret_cast(packed.second[0]), + reinterpret_cast(out.data_ptr()), + /*block_idx_to_tensor_idx=*/packed.second[1], + /*tensor_idx_to_start_tensor_bytes=*/packed.second[2], + /*start_block_idx_per_tensor_chunk=*/packed.second[3], + /*actual_tensor_sizes=*/packed.second[4], + /*pad_tensor_chunk_sizes=*/packed.second[5], + /*num_blocks_per_tensor_chunk=*/packed.second[6], + slice_size, + chunk_size, + dst_elem_size / src_elem_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +} // namespace detail + +// See [CUDA fast path for split_with_sizes_copy.out] +void split_with_sizes_copy_out_cuda_contiguous_no_cast( + const at::Tensor& self, + at::IntArrayRef split_sizes, + int64_t dim, + at::TensorList out) { + const auto device = self.device(); + const auto src_base_addrs = + detail::get_split_base_addrs(self, split_sizes, dim); + const auto dst_base_addrs = detail::get_dst_addrs(out); + const auto src_stride = detail::get_chunk_stride(self, dim); + const auto split_chunk_sizes = + detail::get_split_chunk_sizes(self, split_sizes, dim); + const auto num_chunks = detail::get_num_chunks(self, dim); + + // Calculate the number of 
blocks required for the first chunk across all + // splits, assuming each thread only processes BYTES_PER_THREAD bytes. + int64_t num_blocks = 0; + for (const auto& split_chunk_size : split_chunk_sizes) { + num_blocks += detail::div_up( + split_chunk_size, detail::BLOCK_SIZE * detail::BYTES_PER_THREAD); + } + + // Calculate the maximum number of blocks to launch. Only consider + // maxThreadsPerMultiProcessor as a limiting factor as the kernel uses no + // shared memory and little registers. Over-subscribe the SMs to hide I/O + // latency. + const auto num_sms = + at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + const auto max_threads_per_sm = + at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor; + const int64_t max_blocks = + num_sms * max_threads_per_sm / detail::BLOCK_SIZE * 2.0; + + // Make each thread process BYTES_PER_THREAD * iter_factor bytes to regulate + // block size. Spread iter_factor evenly between chunks_per_block and + // iters_per_chunk. + int64_t iter_factor = detail::div_up(num_blocks * num_chunks, max_blocks); + int64_t chunks_per_block = std::ceil(std::sqrt(iter_factor)); + chunks_per_block = std::min(chunks_per_block, num_chunks); + const int64_t iters_per_chunk = detail::div_up(iter_factor, chunks_per_block); + + // Launch a logically jagged grid of shape + // (chunk_size*, num_splits, num_chunks / chunks_per_block) + // backed by a physical grid of shape + // (sum(chunk_size), num_chunks / chunks_per_block). + // A block can find its split_idx via block_idx_to_split_idx. + std::vector block_idx_to_split_idx; + std::vector blocks_cumsums{0}; + block_idx_to_split_idx.reserve(num_blocks); + for (size_t split_idx = 0; split_idx < split_sizes.size(); ++split_idx) { + const auto blocks = detail::div_up( + split_chunk_sizes[split_idx], + detail::BLOCK_SIZE * detail::BYTES_PER_THREAD * iters_per_chunk); + block_idx_to_split_idx.insert( + block_idx_to_split_idx.end(), blocks, split_idx); + blocks_cumsums.push_back(blocks_cumsums.back() + blocks); + } + + dim3 blocks(blocks_cumsums.back(), num_chunks / chunks_per_block, 1); + dim3 threads(detail::BLOCK_SIZE, 1, 1); + + auto [_, ptrs] = detail::pack_vecs( + {&dst_base_addrs, + &src_base_addrs, + &split_chunk_sizes, + &block_idx_to_split_idx, + &blocks_cumsums}, + device); + + detail::split_with_sizes_copy_out_contiguous_no_cast_kernel<<< + blocks, + threads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + /*dst_base_addrs=*/reinterpret_cast(ptrs[0]), + /*src_base_addrs=*/reinterpret_cast(ptrs[1]), + /*split_chunk_sizes=*/ptrs[2], + /*block_idx_to_split_idx=*/ptrs[3], + /*blocks_cumsums=*/ptrs[4], + src_stride, + num_chunks); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +void split_with_sizes_copy_out_cuda( + const Tensor& self, + IntArrayRef split_sizes, + int64_t dim, + TensorList out) { + const bool is_capturing = at::cuda::currentStreamCaptureStatusMayInitCtx() != + at::cuda::CaptureStatus::None; + bool contiguous_no_cast = self.is_non_overlapping_and_dense(); + for (const auto& t : out) { + contiguous_no_cast &= t.is_non_overlapping_and_dense(); + contiguous_no_cast &= (t.dtype() == self.dtype()); + } + // TODO(yifu): make the fast path work for CUDA graph + if (!is_capturing && contiguous_no_cast) { + // Perform equivalent checks performed by the composite impl + if (dim < 0) { + dim = at::maybe_wrap_dim(dim, self.dim()); + } + TORCH_CHECK( + self.dim() != 0, "split expects at least a 1-dimensional tensor") + + const int64_t dim_size = self.size(dim); + int64_t split_sizes_sum = 0; + for (const auto 
i : c10::irange(split_sizes.size())) { + TORCH_CHECK( + split_sizes[i] >= 0, + "split_with_sizes expects split_sizes have only non-negative ", + "entries, but got split_sizes=", + split_sizes[i]); + split_sizes_sum += split_sizes[i]; + } + TORCH_CHECK( + split_sizes_sum == dim_size, + "split_with_sizes expects split_sizes to sum exactly to ", + dim_size, + " (input tensor's size at dimension ", + dim, + "), ", + "but got split_sizes=", + split_sizes); + + TORCH_CHECK( + out.size() == split_sizes.size(), + "split_with_sizes_copy_out() expected an out= argument of size ", + split_sizes.size(), + ", got size ", + out.size()); + + auto out_shape = self.sizes().vec(); + for (const auto i : c10::irange(split_sizes.size())) { + out_shape[dim] = split_sizes[i]; + if (resize_output_check(out[i], out_shape)) { + out[i].resize_(out_shape); + } + TORCH_CHECK( + out[i].dtype() == self.dtype(), + "Expected out tensor to have dtype ", + self.dtype(), + ", but got ", + out[i].dtype(), + " instead"); + TORCH_CHECK( + out[i].device() == self.device(), + "Expected out tensor to have device ", + self.device(), + ", but got ", + out[i].device(), + " instead"); + } + split_with_sizes_copy_out_cuda_contiguous_no_cast( + self, split_sizes, dim, out); + } else { + at::native::split_with_sizes_copy_out(self, split_sizes, dim, out); + } +} + +Tensor _chunk_cat_cuda(TensorList tensors, int64_t dim, int64_t num_chunks) { + dim = at::native::preprocess_chunk_cat_inputs(tensors, dim, num_chunks); + if (detail::all_contiguous(tensors)) { + // Return a tensor with the same dtype as input tensors + int64_t elem_size = tensors[0].element_size(); + int64_t chunk_size = + detail::get_chunk_size(tensors, dim, num_chunks, elem_size); + int64_t leading_dim = detail::get_leading_dim(tensors[0].sizes(), dim); + auto view_sizes = detail::get_chunk_cat_out_sizes( + tensors[0].sizes(), dim, num_chunks, chunk_size, elem_size); + Tensor out = + tensors[0] + .new_empty(chunk_size * num_chunks * leading_dim / elem_size) + .view(view_sizes); + // Type-agnostic copy since out and input tensors have the same type. + detail::_chunk_cat_out_cuda_contiguous( + tensors, dim, num_chunks, out, elem_size, elem_size); + return out; + } else { + return at::native::_chunk_cat(tensors, dim, num_chunks); + } +} + +Tensor& _chunk_cat_out_cuda( + TensorList tensors, + int64_t dim, + int64_t num_chunks, + Tensor& out) { + dim = at::native::preprocess_chunk_cat_inputs(tensors, dim, num_chunks); + TORCH_CHECK( + tensors[0].device() == out.device(), + "_chunk_cat_out_cuda: mismatch between input and out tensor devices"); + bool both_input_output_contiguous = + detail::all_contiguous(tensors) && out.is_non_overlapping_and_dense(); + if (both_input_output_contiguous && + (tensors[0].dtype() == at::ScalarType::BFloat16) && + (out.dtype() == at::ScalarType::Float)) { + // _chunk_cat_out_cuda_contiguous should also support other types, thanks to + // static_cast_with_inter_type. Here, we dispatch to BFloat16 in and float32 + // out since it is the only known use case. + detail::_chunk_cat_out_cuda_contiguous( + tensors, + dim, + num_chunks, + out, + out.element_size(), + tensors[0].element_size()); + } else if ( + both_input_output_contiguous && tensors[0].dtype() == out.dtype()) { + // Type-agnostic copy since out and input tensors have the same type. 
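// (Here out.element_size() == tensors[0].element_size(), so the dst_to_src_ratio
// handed to chunk_cat_cuda_kernel is 1; the BFloat16 -> Float branch above runs
// with a ratio of 2 and relies on copy_chunk_with_pad's element-wise
// static_cast_with_inter_type path rather than its raw-byte path.)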
+ detail::_chunk_cat_out_cuda_contiguous( + tensors, + dim, + num_chunks, + out, + out.element_size(), + tensors[0].element_size()); + } else { + at::native::_chunk_cat_out(tensors, dim, num_chunks, out); + } + return out; +} + +} // namespace at::native diff --git a/aten/src/ATen/native/cuda/TensorTopK.cpp b/aten/src/ATen/native/cuda/TensorTopK.cpp index 36e45d4dae2a0..f44cdcdcea2c5 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cpp +++ b/aten/src/ATen/native/cuda/TensorTopK.cpp @@ -26,8 +26,7 @@ void topk_out_with_sort( const Tensor& values, const Tensor& indices ) { - Tensor sorted_values, sorted_indices; - std::tie(sorted_values, sorted_indices) = at::cuda::sort(self, /* stable= */false, dim, largest); + auto [sorted_values, sorted_indices] = at::cuda::sort(self, /* stable= */false, dim, largest); values.copy_(sorted_values.narrow(dim, 0, k)); indices.copy_(sorted_indices.narrow(dim, 0, k)); } diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu index bd48c9b058084..d06efa6635131 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cu +++ b/aten/src/ATen/native/cuda/TensorTopK.cu @@ -37,7 +37,7 @@ struct AddOp { template C10_LAUNCH_BOUNDS_1(1024) -__global__ void gatherTopK(at::cuda::detail::TensorInfo input, +__global__ void gatherTopK(at::cuda::detail::TensorInfo input, IndexType inputSliceSize, IndexType outputSliceSize, // aka `k` bool largest, @@ -65,13 +65,13 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo input, // Find the start offset for our slice IndexType sliceStartIndex = - at::cuda::detail::IndexToOffset::get(slice, input); + at::cuda::detail::IndexToOffset::get(slice, input); IndexType topKSliceStartIndex = at::cuda::detail::IndexToOffset::get(slice, topK); IndexType indicesSliceStartIndex = at::cuda::detail::IndexToOffset::get(slice, indices); - T* inputSliceStart = &input.data[sliceStartIndex]; + const T* inputSliceStart = &input.data[sliceStartIndex]; T* topKSliceStart = &topK.data[topKSliceStartIndex]; int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; @@ -179,7 +179,7 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo input, template void launch( - at::cuda::detail::TensorInfo input, + at::cuda::detail::TensorInfo input, IndexType inputSliceSize, IndexType outputSliceSize, // aka `k` bool largest, @@ -247,7 +247,7 @@ __global__ void fill(T* x, T value, IndexType size) { template C10_LAUNCH_BOUNDS_1(BLOCK_THREADS) __global__ void radixFindKthValues( - at::cuda::detail::TensorInfo input, + at::cuda::detail::TensorInfo input, uint32_t slice_size, uint32_t* ks_to_find, // size: num_slices @@ -277,8 +277,8 @@ __global__ void radixFindKthValues( Bitwise desired = desires[slice_idx]; uint32_t k_to_find = ks_to_find[slice_idx]; - IndexType slice_start_index = at::cuda::detail::IndexToOffset::get(slice_idx, input); - T* data = &input.data[slice_start_index]; + IndexType slice_start_index = at::cuda::detail::IndexToOffset::get(slice_idx, input); + const T* data = &input.data[slice_start_index]; typedef cub::BlockScan BlockScan; static_assert(MAX_ITEMS_PER_THREAD * BLOCK_THREADS < std::numeric_limits::max(), @@ -300,7 +300,7 @@ __global__ void radixFindKthValues( ? 
items_per_thread : at::ceil_div((int64_t)(slice_size - blk_idx_in_slice * items_per_block), (int64_t)BLOCK_THREADS); - // collect digit counts and store in shared memorey + // collect digit counts and store in shared memory for (int i = 0; i < items_per_thread; ++i) { // Find the start offset for this slice IndexType idx = blk_idx_in_slice * items_per_block + i * BLOCK_THREADS + tidx; @@ -493,7 +493,7 @@ __global__ void computeBlockwiseKthCounts( template C10_LAUNCH_BOUNDS_1(BLOCK_THREADS) -__global__ void gatherTopK(at::cuda::detail::TensorInfo input, +__global__ void gatherTopK(at::cuda::detail::TensorInfo input, IndexType inputSliceSize, IndexType outputSliceSize, // aka `k` bool largest, @@ -537,13 +537,13 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo input, // Find the start offset for our slice IndexType sliceStartIndex = - at::cuda::detail::IndexToOffset::get(slice_idx, input); + at::cuda::detail::IndexToOffset::get(slice_idx, input); IndexType topKSliceStartIndex = at::cuda::detail::IndexToOffset::get(slice_idx, topK); IndexType indicesSliceStartIndex = at::cuda::detail::IndexToOffset::get(slice_idx, indices); - T* inputSliceStart = &input.data[sliceStartIndex]; + const T* inputSliceStart = &input.data[sliceStartIndex]; T* topKSliceStart = &topK.data[topKSliceStartIndex]; int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; @@ -640,7 +640,7 @@ public: template void launch( - at::cuda::detail::TensorInfo input, + at::cuda::detail::TensorInfo input, IndexType inputSliceSize, IndexType outputSliceSize, // aka `k` bool largest, @@ -836,8 +836,8 @@ void launch_gather_topk_kernel( #define RUN_T(INDEX_T) \ AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "topk_out_cuda", [&] { \ - at::cuda::detail::TensorInfo inputInfo = \ - at::cuda::detail::getTensorInfo(input); \ + at::cuda::detail::TensorInfo inputInfo = \ + at::cuda::detail::getTensorInfo(input); \ at::cuda::detail::TensorInfo topKInfo = \ at::cuda::detail::getTensorInfo(values); \ at::cuda::detail::TensorInfo indicesInfo = \ diff --git a/aten/src/ATen/native/cuda/TriangularOps.cu b/aten/src/ATen/native/cuda/TriangularOps.cu index e7ab3a44ddf35..efc79672f5621 100644 --- a/aten/src/ATen/native/cuda/TriangularOps.cu +++ b/aten/src/ATen/native/cuda/TriangularOps.cu @@ -19,33 +19,53 @@ #include +#define BOOL_SWITCH(COND, CONST_NAME, ...) 
\ + [&] { \ + if (COND) { \ + constexpr static bool CONST_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr static bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() + namespace at::native { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triu/tril ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -template -C10_LAUNCH_BOUNDS_1(cuda::getApplyBlockSize()) +constexpr static int block_size = 128; + +template +C10_LAUNCH_BOUNDS_1(block_size) __global__ void triu_tril_kernel( cuda::detail::TensorInfo result_info, - const cuda::detail::TensorInfo self_info, + const cuda::detail::TensorInfo self_info, const int64_t k, - const int64_t N) { - int64_t linear_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (linear_idx >= N) { + const int64_t N_padded, + const IndexType last_dim_padded) { + int64_t linear_idx = (blockIdx.x * blockDim.x + threadIdx.x) * elements_per_thread; + if (linear_idx >= N_padded) { return; } auto dims = self_info.dims; + // Compute column index amd row index + IndexType col = linear_idx % last_dim_padded; + linear_idx /= last_dim_padded; + IndexType row = linear_idx % self_info.sizes[dims - 2]; + + if constexpr (inplace) { + bool mask_all_true = upper ? (col - row >= k) : (col + elements_per_thread - row <= k); + if (mask_all_true) + return; + } + + // Compute offset IndexType self_offset = 0, result_offset = 0; - // Compute column index and corresponding offset - IndexType col = linear_idx % self_info.sizes[dims - 1]; - linear_idx /= self_info.sizes[dims - 1]; self_offset += self_info.strides[dims - 1] * col; result_offset += result_info.strides[dims - 1] * col; - - // Compute row index and corresponding offset - IndexType row = linear_idx % self_info.sizes[dims - 2]; linear_idx /= self_info.sizes[dims - 2]; self_offset += self_info.strides[dims - 2] * row; result_offset += result_info.strides[dims - 2] * row; @@ -60,34 +80,65 @@ __global__ void triu_tril_kernel( result_offset += running_index * result_info.strides[i]; } - bool mask = upper ? (col - row >= k) : (col - row <= k); - result_info.data[result_offset] = mask ? self_info.data[self_offset] : scalar_t(0); + if constexpr (inplace) { + #pragma unroll + for (int i = 0; i < elements_per_thread && col + i < self_info.sizes[dims - 1]; i++) { + bool mask = upper ? (col + i - row >= k) : (col + i - row <= k); + if (!mask) + result_info.data[result_offset + i * result_info.strides[dims - 1]] = scalar_t(0); + } + } else { + scalar_t frag[elements_per_thread] = {}; + bool has_mask = (upper && col + elements_per_thread - row >= k) || (!upper && col - row <= k); + if (has_mask) { + #pragma unroll + for (int i = 0; i < elements_per_thread && col + i < self_info.sizes[dims - 1]; i++) + frag[i] = self_info.data[self_offset + i * self_info.strides[dims - 1]]; + + #pragma unroll + for (int i = 0; i < elements_per_thread; i++) { + bool mask = upper ? (col + i - row >= k) : (col + i - row <= k); + frag[i] = mask ? 
frag[i] : scalar_t(0); + } + } + + #pragma unroll + for (int i = 0; i < elements_per_thread && col + i < self_info.sizes[dims - 1]; i++) + result_info.data[result_offset + i * result_info.strides[dims - 1]] = frag[i]; + } } template void triu_tril_cuda_template(const Tensor& result, const Tensor& self, int64_t k, const char* name) { - int64_t N = self.numel(); - dim3 dim_block = cuda::getApplyBlock(); - dim3 dim_grid((N + dim_block.x - 1) / dim_block.x); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( at::ScalarType::ComplexHalf, at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "triu_tril_cuda_template", [&] { + constexpr int elements_per_thread = sizeof(scalar_t) < 8 ? 8 / sizeof(scalar_t) : 1; + auto sizes = self.sizes(); + int64_t last_dim_padded = round_up(sizes.back(), elements_per_thread); + int64_t N_padded = c10::multiply_integers(sizes.begin(), sizes.end() - 1) * last_dim_padded; + dim3 dim_block = block_size; + dim3 dim_grid((N_padded / elements_per_thread + dim_block.x - 1) / dim_block.x); if (cuda::detail::canUse32BitIndexMath(result) && cuda::detail::canUse32BitIndexMath(self)) { auto result_info = cuda::detail::getTensorInfo(result); - auto self_info = cuda::detail::getTensorInfo(self); - triu_tril_kernel - <<>>( - result_info, self_info, k, N); + auto self_info = cuda::detail::getTensorInfo(self); + BOOL_SWITCH(self.is_same(result), inplace, [&] { + triu_tril_kernel + <<>>( + result_info, self_info, k, N_padded, last_dim_padded); + }); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { auto result_info = cuda::detail::getTensorInfo(result); - auto self_info = cuda::detail::getTensorInfo(self); - triu_tril_kernel - <<>>( - result_info, self_info, k, N); + auto self_info = cuda::detail::getTensorInfo(self); + BOOL_SWITCH(self.is_same(result), inplace, [&] { + triu_tril_kernel + <<>>( + result_info, self_info, k, N_padded, last_dim_padded); + }); C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index 31aa3a9cd10e6..e2654be0135f8 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -1,6 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include @@ -99,7 +99,7 @@ std::tuple unique_dim_cuda_template( * For unique_dim, we are taking the unique with respect to a index * tensor, but during the processes, we override the compare and equal * operator by checking the data underlying it instead. After the - * algorithm, we would use index_select to map the resulting indicies + * algorithm, we would use index_select to map the resulting indices * to the result on the actual data. 
*/ @@ -152,9 +152,7 @@ std::tuple unique_dim_cuda_template( ); } - Tensor inverse_indices, counts; - int64_t num_out; - std::tie(inverse_indices, counts, num_out) = compute_unique( + auto [inverse_indices, counts, num_out] = compute_unique( policy, indices_data, num_inp, indices, return_inverse, return_counts, options, [=] __device__ (int64_t a, int64_t b) -> bool { @@ -188,46 +186,45 @@ std::tuple unique_dim_cuda_template( std::tuple _unique_cuda(const Tensor& self, const bool sorted, const bool return_inverse) { - return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { // The current CUDA implementation of unique always sort due to the // lack of hashtable implementation in thrust - Tensor output, inverse; - std::tie(output, inverse, std::ignore) = internal::unique_cuda_template(self, false, return_inverse, false); + auto [output, inverse, _] = internal::unique_cuda_template(self, false, return_inverse, false); return std::make_tuple(output, inverse); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple _unique2_cuda(const Tensor& self, const bool sorted, const bool return_inverse, const bool return_counts) { - return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { // The current CUDA implementation of unique always sort due to the // lack of hashtable implementation in thrust return internal::unique_cuda_template(self, false, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple unique_dim_cuda(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse, const bool return_counts) { - return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique_dim", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique_dim", AT_WRAP([&] { return unique_dim_cuda_template(self, dim, false, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple unique_dim_consecutive_cuda(const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts) { - return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique_dim", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique_dim", AT_WRAP([&] { return unique_dim_cuda_template(self, dim, true, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple unique_consecutive_cuda(const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional dim) { if (!dim.has_value()) { - return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { // The current CUDA implementation of unique always sort due to the // lack of hashtable implementation in thrust return internal::unique_cuda_template(self, true, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } return unique_dim_consecutive_cuda(self, dim.value(), return_inverse, return_counts); } diff --git a/aten/src/ATen/native/cuda/UniqueCub.cu b/aten/src/ATen/native/cuda/UniqueCub.cu index 38f75f1ee4fee..bbd8673bcf5a6 100644 --- a/aten/src/ATen/native/cuda/UniqueCub.cu +++ 
b/aten/src/ATen/native/cuda/UniqueCub.cu @@ -84,7 +84,7 @@ std::tuple compute_unique( const dim3 block = dim3(std::min(static_cast(cuda::getApplyBlock().x), num_inp)); dim3 grid; - int curDevice = -1; + c10::DeviceIndex curDevice = -1; c10::cuda::GetDevice(&curDevice); cuda::getApplyGrid(num_inp, grid, curDevice); adjacent_difference_kernel<<>>( @@ -158,12 +158,14 @@ struct UniqueCub { } else { sorted = at::empty(self.sizes(), self.options()); } - scalar_t* sorted_data = sorted.mutable_data_ptr(); Tensor sorted_indices; if (!return_inverse) { if (!consecutive) { - cuda::cub::radix_sort_keys(self.const_data_ptr(), sorted_data, num_inp); + cuda::cub::radix_sort_keys( + self.const_data_ptr(), + sorted.mutable_data_ptr(), + num_inp); } } else { if (!consecutive) { @@ -172,7 +174,7 @@ struct UniqueCub { sorted_indices = at::empty({num_inp}, options); cuda::cub::radix_sort_pairs( self.const_data_ptr(), - sorted_data, + sorted.mutable_data_ptr(), range.const_data_ptr(), sorted_indices.mutable_data_ptr(), num_inp); @@ -286,7 +288,7 @@ struct UniqueCub { C10_CUDA_KERNEL_LAUNCH_CHECK(); } - // Final sync to fix the ouput tensors shape + // Final sync to fix the output tensors shape int num_true = 0; at::cuda::memcpy_and_sync(&num_true, tmp_num_true.get(), sizeof(int), cudaMemcpyDeviceToHost, stream); @@ -333,6 +335,9 @@ INSTANTIATE_UNIQUE_CUDA_TEMPLATE(float); INSTANTIATE_UNIQUE_CUDA_TEMPLATE(int32_t); INSTANTIATE_UNIQUE_CUDA_TEMPLATE(int64_t); INSTANTIATE_UNIQUE_CUDA_TEMPLATE(int16_t); +INSTANTIATE_UNIQUE_CUDA_TEMPLATE(uint32_t); +INSTANTIATE_UNIQUE_CUDA_TEMPLATE(uint64_t); +INSTANTIATE_UNIQUE_CUDA_TEMPLATE(uint16_t); INSTANTIATE_UNIQUE_CUDA_TEMPLATE(bool); INSTANTIATE_UNIQUE_CUDA_TEMPLATE(at::Half); diff --git a/aten/src/ATen/native/cuda/UpSample.cuh b/aten/src/ATen/native/cuda/UpSample.cuh index 09e460640df8d..b7f97088c5ff3 100644 --- a/aten/src/ATen/native/cuda/UpSample.cuh +++ b/aten/src/ATen/native/cuda/UpSample.cuh @@ -183,7 +183,7 @@ __device__ __forceinline__ static int nearest_neighbor_exact_bw_compute_source_i /* Used by UpSampleBicubic2d.cu */ template __device__ __forceinline__ static scalar_t upsample_get_value_bounded( - const PackedTensorAccessor64& data, + const PackedTensorAccessor64& data, int batch, int channel, int height, diff --git a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu index c96d7dbae7630..6673fe4993f39 100644 --- a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu @@ -26,7 +26,7 @@ __global__ void upsample_bicubic2d_out_frame( const accscalar_t height_scale, const accscalar_t width_scale, const bool align_corners, - const PackedTensorAccessor64 idata, + const PackedTensorAccessor64 idata, PackedTensorAccessor64 odata) { int index = threadIdx.x + blockIdx.x * blockDim.x; @@ -102,7 +102,7 @@ __global__ void upsample_bicubic2d_backward_out_frame( const accscalar_t width_scale, const bool align_corners, PackedTensorAccessor64 idata, - const PackedTensorAccessor64 odata) { + const PackedTensorAccessor64 odata) { int index = threadIdx.x + blockIdx.x * blockDim.x; const int batchsize = idata.size(0); @@ -195,7 +195,7 @@ static void upsample_bicubic2d_out_cuda_template( input.scalar_type(), "upsample_bicubic2d_out_frame", [&] { using accscalar_t = at::acc_type; - auto idata = input.packed_accessor64(); + auto idata = input.packed_accessor64(); auto odata = output.packed_accessor64(); // Get scaling factors @@ -252,7 +252,7 @@ static void 
upsample_bicubic2d_backward_out_cuda_template( using accscalar_t = at::acc_type; auto idata = grad_input.packed_accessor64(); - auto odata = grad_output.packed_accessor64(); + auto odata = grad_output.packed_accessor64(); const accscalar_t rheight = area_pixel_compute_scale( input_height, output_height, align_corners, scales_h); diff --git a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu index 1570853c844aa..3c80cb7877a5c 100644 --- a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu @@ -37,7 +37,7 @@ __global__ void upsample_bilinear2d_out_frame( const accscalar_t rheight, const accscalar_t rwidth, const bool align_corners, - const PackedTensorAccessor idata, + const PackedTensorAccessor idata, PackedTensorAccessor odata) { int index = threadIdx.x + blockIdx.x * blockDim.x; @@ -337,7 +337,7 @@ static void upsample_bilinear2d_out_cuda_template( using accscalar_t = at::acc_type; - auto idata = input.packed_accessor64(); + auto idata = input.packed_accessor64(); auto odata = output.packed_accessor64(); const accscalar_t rheight = area_pixel_compute_scale( @@ -474,7 +474,7 @@ C10_LAUNCH_BOUNDS_1(256) // 256 performs better then 1024 __global__ void upsample_gen2d_aa_out_frame( const accscalar_t height_scale, const accscalar_t width_scale, - const PackedTensorAccessor64 idata, + const PackedTensorAccessor64 idata, PackedTensorAccessor64 odata, const InterpFilter & interp_filter) { @@ -568,7 +568,7 @@ __global__ void upsample_gen2d_aa_backward_out_frame( const accscalar_t height_scale, const accscalar_t width_scale, PackedTensorAccessor64 idata, - const PackedTensorAccessor64 odata, + const PackedTensorAccessor64 odata, const InterpFilter & interp_filter) { const int batchsize = idata.size(0); @@ -704,7 +704,7 @@ static void upsample_gen2d_aa_out_cuda_template( input.scalar_type(), "upsample_bilinear2d_out_frame", [&] { using accscalar_t = at::acc_type; - auto idata = input.packed_accessor64(); + auto idata = input.packed_accessor64(); auto odata = output_c.packed_accessor64(); const accscalar_t height_scale = area_pixel_compute_scale( @@ -807,7 +807,7 @@ static void upsample_gen2d_aa_backward_out_cuda_template( using accscalar_t = at::acc_type; auto idata = grad_input.packed_accessor64(); - auto odata = grad_output.packed_accessor64(); + auto odata = grad_output.packed_accessor64(); const accscalar_t height_scale = area_pixel_compute_scale( input_height, output_height, align_corners, scales_h); diff --git a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu index 54a03ae61b8f8..dfba2f5479071 100644 --- a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu @@ -28,7 +28,7 @@ __global__ void upsample_linear1d_out_frame( const int n, const accscalar_t rwidth, const bool align_corners, - const PackedTensorAccessor64 idata, + const PackedTensorAccessor64 idata, PackedTensorAccessor64 odata) { int index = threadIdx.x + blockIdx.x * blockDim.x; @@ -76,7 +76,7 @@ __global__ void upsample_linear1d_out_frame_backward( const accscalar_t rwidth, const bool align_corners, PackedTensorAccessor64 idata, - const PackedTensorAccessor64 odata) { + const PackedTensorAccessor64 odata) { int index = threadIdx.x + blockIdx.x * blockDim.x; const int batchsize = idata.size(0); @@ -143,7 +143,7 @@ static void upsample_linear1d_out_cuda_template( input.scalar_type(), "upsample_linear1d_out_frame", [&] { using accscalar_t = 
at::acc_type; - auto idata = input.packed_accessor64(); + auto idata = input.packed_accessor64(); auto odata = output.packed_accessor64(); const accscalar_t rwidth = area_pixel_compute_scale( @@ -188,7 +188,7 @@ static void upsample_linear1d_backward_out_cuda_template( using accscalar_t = at::acc_type; auto idata = grad_input.packed_accessor64(); - auto odata = grad_output.packed_accessor64(); + auto odata = grad_output.packed_accessor64(); const accscalar_t rwidth = area_pixel_compute_scale( input_width, output_width, align_corners, scales); diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index 6bfeef431c13f..3085cba0a1d16 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -175,6 +175,7 @@ static void upsample_nearest1d_backward_out_cuda_template( dim3 gdim{ceil_div(n, bdim.x)}; // safe check for int32 indexing; implicitly restrict launch config for kernel TORCH_CHECK(grad_input.numel() <= std::numeric_limits::max()); + TORCH_CHECK(grad_output.numel() <= std::numeric_limits::max()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest1d_backward_out_frame", [&] { diff --git a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu index ba71fdc0b077f..197fc9d60bef7 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu @@ -122,7 +122,7 @@ __global__ void upsample_nearest2d_backward_out_frame( scalar_t* grad_i, float height_scale, float width_scale) { - int dst_idx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t dst_idx = blockIdx.x * blockDim.x + threadIdx.x; if (dst_idx >= dim_c * dst_dim_h * dst_dim_w) return; @@ -151,7 +151,7 @@ __global__ void upsample_nearest2d_backward_out_frame( accscalar_t grad = 0; for (int y = src_y; y < src_y_up; y++) { for (int x = src_x; x < src_x_up; x++) { - int src_idx = + int64_t src_idx = b * dim_c * src_c_stride + c * src_c_stride + y * src_dim_w + x; grad += grad_o[src_idx]; } @@ -408,8 +408,9 @@ static void upsample_nearest2d_backward_out_cuda_template( dim3 bdim{std::min( at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, MAX_THREADS)}; dim3 gdim{ceil_div(n, bdim.x)}; - // safe check for int32 indexing; implicitly restrict launch config for kernel - TORCH_CHECK(grad_input.numel() <= std::numeric_limits::max()); + // safe check for int64 indexing; implicitly restrict launch config for kernel + TORCH_CHECK(grad_input.numel() <= std::numeric_limits::max(), "upsample2d grad_input.numel() <= std::numeric_limits::max()"); + TORCH_CHECK(grad_output.numel() <= std::numeric_limits::max(), "upsample2d grad_output.numel() <= std::numeric_limits::max()"); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest2d_backward_out_frame", [&] { diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu index f9c1dfdb8ab76..31a7ee92e7488 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu @@ -255,6 +255,7 @@ static void upsample_nearest3d_backward_out_cuda_template( dim3 gdim{ceil_div(n, bdim.x)}; // safe check for int32 indexing; implicitly restrict launch 
config for kernel TORCH_CHECK(grad_input.numel() <= std::numeric_limits::max()); + TORCH_CHECK(grad_output.numel() <= std::numeric_limits::max()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest3d_backward_out_frame", [&] { diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index de8e797c6d358..43cc09d34b677 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -43,7 +43,7 @@ __global__ void upsample_trilinear3d_out_frame( const accscalar_t rheight, const accscalar_t rwidth, const bool align_corners, - const PackedTensorAccessor64 idata, + const PackedTensorAccessor64 idata, PackedTensorAccessor64 odata) { int index = threadIdx.x + blockIdx.x * blockDim.x; @@ -128,7 +128,7 @@ __global__ void upsample_trilinear3d_backward_out_frame( const accscalar_t rwidth, const bool align_corners, PackedTensorAccessor64 idata, - const PackedTensorAccessor64 odata, + const PackedTensorAccessor64 odata, scalar_t* idata_ptr) { int index = threadIdx.x + blockIdx.x * blockDim.x; @@ -269,7 +269,7 @@ static void upsample_trilinear3d_out_cuda_template( input.scalar_type(), "upsample_trilinear3d_out_frame", [&] { using accscalar_t = at::acc_type; - auto idata = input.packed_accessor64(); + auto idata = input.packed_accessor64(); auto odata = output.packed_accessor64(); const accscalar_t rdepth = area_pixel_compute_scale( @@ -296,7 +296,7 @@ static void upsample_trilinear3d_out_cuda_template( } static void upsample_trilinear3d_backward_out_cuda_template( - const Tensor& grad_input, + const Tensor& grad_input_, const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size, @@ -304,7 +304,7 @@ static void upsample_trilinear3d_backward_out_cuda_template( c10::optional scales_d, c10::optional scales_h, c10::optional scales_w) { - TensorArg grad_input_arg{grad_input, "grad_input", 1}, + TensorArg grad_input_arg{grad_input_, "grad_input_", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU( "upsample_trilinear3d_backward_out_cuda", @@ -321,7 +321,8 @@ static void upsample_trilinear3d_backward_out_cuda_template( Tensor grad_output = grad_output_.contiguous(); // A contiguous tensor is required for the kernel launch config - grad_input.contiguous(); + Tensor grad_input = grad_input_.contiguous(); + // Numbers are added atomically to grad_input tensor from multiple threads, // so it has to be initialized to zero. 
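(Editorial aside, not part of the patch: the trilinear3d hunk above also fixes a latent bug — the old code called `grad_input.contiguous()` and discarded the result, so a non-contiguous `grad_input` could reach a kernel that assumes contiguous storage and accumulates into it atomically. The sketch below uses standard ATen calls plus a hypothetical `accumulate_into` stand-in for the backward kernel, and shows the pattern the fix follows: compute into a contiguous, zero-initialized buffer and copy back only if the caller's tensor was not contiguous.)

```cpp
// Minimal sketch of the contiguous-scratch pattern; accumulate_into is a
// hypothetical placeholder for the real atomic-add backward kernel.
#include <ATen/ATen.h>

void accumulate_into(at::Tensor& buf, const at::Tensor& src) {
  buf.add_(src.sum());  // stand-in for the kernel's atomic accumulation
}

void backward_like(const at::Tensor& grad_output, at::Tensor& grad_input_) {
  // contiguous() returns the tensor itself when it is already contiguous,
  // otherwise it materializes a contiguous copy.
  at::Tensor grad_input = grad_input_.contiguous();
  // Values are accumulated from many threads, so start from zero.
  grad_input.zero_();
  accumulate_into(grad_input, grad_output);
  // If we worked on a copy, write the result back to the caller's tensor.
  if (!grad_input_.is_contiguous()) {
    grad_input_.copy_(grad_input);
  }
}
```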
grad_input.zero_(); @@ -339,7 +340,7 @@ static void upsample_trilinear3d_backward_out_cuda_template( using accscalar_t = at::acc_type; auto idata = grad_input.packed_accessor64(); - auto odata = grad_output.packed_accessor64(); + auto odata = grad_output.packed_accessor64(); scalar_t* idata_ptr = grad_input.mutable_data_ptr(); const accscalar_t rdepth = area_pixel_compute_scale( @@ -363,6 +364,10 @@ static void upsample_trilinear3d_backward_out_cuda_template( odata, idata_ptr); C10_CUDA_KERNEL_LAUNCH_CHECK(); + + if (!grad_input_.is_contiguous()) { + grad_input_.copy_(grad_input); + } }); } diff --git a/aten/src/ATen/native/cuda/block_reduce.cuh b/aten/src/ATen/native/cuda/block_reduce.cuh index fa75c71f8acaf..e8fd69c0aec93 100644 --- a/aten/src/ATen/native/cuda/block_reduce.cuh +++ b/aten/src/ATen/native/cuda/block_reduce.cuh @@ -16,7 +16,7 @@ constexpr int kCUDABlockReduceNumThreads = 512; // NOTE: This is >= the max block size on current hardware anyway (1024). constexpr int kCUDABlockReduceMaxThreads = C10_WARP_SIZE * C10_WARP_SIZE; -// Sums `val` accross all threads in a warp. +// Sums `val` across all threads in a warp. // // Assumptions: // - The size of each block should be a multiple of `C10_WARP_SIZE` @@ -29,6 +29,19 @@ __inline__ __device__ T WarpReduceSum(T val) { return val; } +// Picks the maximum `val` across all threads in a warp. +// +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +template +__inline__ __device__ T WarpReduceMax(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val = max_propagate_nan(val, WARP_SHFL_DOWN(val, offset)); + } + return val; +} + struct Block1D { static __forceinline__ __device__ int Tid() { return threadIdx.x; } @@ -72,6 +85,31 @@ __inline__ __device__ T BlockReduceSum(T val, T* shared) { return val; } +// Picks out the maximum `val` across all threads in a block. +// +// Warning: the return value is only valid for thread 0. +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +// - `shared` should be a pointer to shared memory with size of, at least, +// `sizeof(T) * number_of_warps` +template +__inline__ __device__ T BlockReduceMax(T val, T* shared) { + const int tid = B::Tid(); + const int lid = tid % C10_WARP_SIZE; + const int wid = tid / C10_WARP_SIZE; + val = WarpReduceMax(val); + __syncthreads(); // prevent races when BlockReduces are called in a row. + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (tid < B::Warps()) ? 
shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceMax(val); + } + return val; +} + template __inline__ __device__ T WarpReduce(T val, const ReduceOp& op) { #pragma unroll diff --git a/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_base.h b/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_base.h index a800dbaeaa2d6..0d1b0bd8e7a5e 100644 --- a/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_base.h +++ b/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_base.h @@ -113,7 +113,7 @@ class DqMmaBase { /// Shape describing the number of warps filling the CTA using WarpCount = GemmShape; - /// Number of warp-level GEMM oeprations + /// Number of warp-level GEMM operations static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); static constexpr int kNumKIterationsPerWarpBLoad = diff --git a/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h b/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h index 6395713824347..6517e1a4f7a13 100644 --- a/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h +++ b/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h @@ -149,7 +149,7 @@ class DqMmaPipelined: public DqMmaBase @@ -426,4 +426,4 @@ struct FastInterleavedAndBiasedNumericArrayConverter { } // namespace cutlass -///////////////////////////////////////////////////////////////////////////////////////////////// \ No newline at end of file +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu b/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu index d970914dbc294..9cebb82e512a8 100644 --- a/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu @@ -30,11 +30,11 @@ void _fused_adam_amsgrad_cuda_impl_( exp_avg_sqs.vec(), max_exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? found_inf->data_ptr() : nullptr; - float* lr_ptr = nullptr; + const float* lr_ptr = nullptr; AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -45,7 +45,7 @@ void _fused_adam_amsgrad_cuda_impl_( multi_tensor_apply_for_fused_optimizer<5>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, // unused lr, beta1, @@ -53,10 +53,8 @@ void _fused_adam_amsgrad_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ true, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ORIGINAL); + found_inf_ptr); }); } @@ -83,11 +81,11 @@ void _fused_adam_amsgrad_cuda_impl_( exp_avg_sqs.vec(), max_exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; - float* lr_ptr = lr.data_ptr(); + const float* lr_ptr = lr.const_data_ptr(); AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -98,7 +96,7 @@ void _fused_adam_amsgrad_cuda_impl_( multi_tensor_apply_for_fused_optimizer<5>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, 1.0, // unused beta1, @@ -106,10 +104,8 @@ void _fused_adam_amsgrad_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ true, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ORIGINAL); + found_inf_ptr); }); } diff --git a/aten/src/ATen/native/cuda/fused_adam_impl.cu b/aten/src/ATen/native/cuda/fused_adam_impl.cu index 075dd38f3aaed..7f2843b3b4ee4 100644 --- a/aten/src/ATen/native/cuda/fused_adam_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adam_impl.cu @@ -25,11 +25,11 @@ void _fused_adam_cuda_impl_( std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? found_inf->data_ptr() : nullptr; - float* lr_ptr = nullptr; + const float* lr_ptr = nullptr; AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -40,7 +40,7 @@ void _fused_adam_cuda_impl_( multi_tensor_apply_for_fused_optimizer<4>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, // unused lr, beta1, @@ -48,10 +48,8 @@ void _fused_adam_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ false, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ORIGINAL); + found_inf_ptr); }); } @@ -73,11 +71,11 @@ void _fused_adam_cuda_impl_( std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; - float* lr_ptr = lr.data_ptr(); + const float* lr_ptr = lr.const_data_ptr(); AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -88,7 +86,7 @@ void _fused_adam_cuda_impl_( multi_tensor_apply_for_fused_optimizer<4>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, 1.0, // unused beta1, @@ -96,10 +94,8 @@ void _fused_adam_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ false, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ORIGINAL); + found_inf_ptr); }); } diff --git a/aten/src/ATen/native/cuda/fused_adam_utils.cuh b/aten/src/ATen/native/cuda/fused_adam_utils.cuh index 25de84ee7c971..182195969ed9a 100644 --- a/aten/src/ATen/native/cuda/fused_adam_utils.cuh +++ b/aten/src/ATen/native/cuda/fused_adam_utils.cuh @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace native { @@ -17,20 +18,25 @@ constexpr uint8_t kExpAvgIdx = 2; constexpr uint8_t kExpAvgSqIdx = 3; constexpr uint8_t kMaxExpAvgSqIdx = 4; -template -C10_DEVICE __forceinline__ void adam_math( +template < + typename scalar_type, + typename opmath_t, + int depth, + ADAM_MODE adam_mode, + bool amsgrad> +C10_DEVICE inline void adam_math( scalar_type r_args[depth][kILP], - const float* step_count, - const double lr, - const double beta1, - const double beta2, - const double weight_decay, - const double eps, - const bool maximize, - const bool amsgrad, + const double& lr, + const double& beta1, + const double& beta2, + const double& weight_decay, + const double& eps, + const bool& maximize, const float* grad_scale_ptr, const float* found_inf_ptr, - const ADAM_MODE adam_mode) { + const opmath_t& bias_correction1, + const opmath_t& bias_correction2_sqrt) { + static_assert(depth == 4 || depth == 5); #pragma unroll for (int ii = 0; ii < kILP; ii++) { // Load values. @@ -51,23 +57,17 @@ C10_DEVICE __forceinline__ void adam_math( } // Update param, grad, 1st and 2nd order momentum. if (weight_decay != 0) { - switch (adam_mode) { - case ADAM_MODE::ORIGINAL: - grad += param * weight_decay; - break; - case ADAM_MODE::ADAMW: - param -= lr * weight_decay * param; - break; + if constexpr (adam_mode == ADAM_MODE::ORIGINAL) { + grad += param * weight_decay; + } else if constexpr (adam_mode == ADAM_MODE::ADAMW) { + param -= lr * weight_decay * param; } } // todo(crcrpar): use lerp // ref: https://developer.nvidia.com/blog/lerp-faster-cuda/ exp_avg = beta1 * exp_avg + (1 - beta1) * grad; exp_avg_sq = beta2 * exp_avg_sq + (1 - beta2) * grad * grad; - const opmath_t bias_correction1 = 1 - at::native::pow_(beta1, *step_count); const opmath_t step_size = lr / bias_correction1; - const opmath_t bias_correction2 = 1 - at::native::pow_(beta2, *step_count); - const opmath_t bias_correction2_sqrt = std::sqrt(bias_correction2); opmath_t denom; if (amsgrad) { max_exp_avg_sq = std::max(max_exp_avg_sq, exp_avg_sq); @@ -102,7 +102,7 @@ C10_DEVICE __forceinline__ void adam_math( // parameter updates accordingly. To be functionally on par with `torch.optim` // optimizers and `_multi_tensor` ones, the kernel below writes out gradients // only when `grad_scale_ptr != nullptr. 
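(Editorial aside, not part of the patch: the surrounding hunks promote `amsgrad` and the `ADAM_MODE` from runtime arguments of `adam_math` to template parameters, and hoist the bias-correction factors out of the per-element math, so each fused-optimizer instantiation compiles only the branches it actually uses. Below is a minimal, simplified sketch of that compile-time dispatch pattern with hypothetical names; first-moment and bias-correction terms are omitted for brevity.)

```cpp
// Illustrative only: hypothetical names, not the ATen kernels themselves.
#include <algorithm>
#include <cmath>
#include <cstdio>

enum class AdamMode { Original, AdamW };

// With the mode and the amsgrad flag as template parameters, `if constexpr`
// removes the untaken branches from each instantiation at compile time.
template <AdamMode mode, bool amsgrad>
void adam_step(double& param, double grad, double& exp_avg_sq,
               double& max_exp_avg_sq, double lr, double weight_decay) {
  if constexpr (mode == AdamMode::Original) {
    grad += param * weight_decay;        // L2 penalty folded into the gradient
  } else {
    param -= lr * weight_decay * param;  // decoupled (AdamW-style) decay
  }
  exp_avg_sq = 0.999 * exp_avg_sq + 0.001 * grad * grad;
  double denom = exp_avg_sq;
  if constexpr (amsgrad) {
    max_exp_avg_sq = std::max(max_exp_avg_sq, exp_avg_sq);
    denom = max_exp_avg_sq;              // normalize by the running maximum
  }
  param -= lr * grad / (std::sqrt(denom) + 1e-8);
}

int main() {
  double p = 1.0, v = 0.0, vmax = 0.0;
  adam_step<AdamMode::AdamW, /*amsgrad=*/true>(p, 0.1, v, vmax, 1e-3, 1e-2);
  std::printf("param after one step: %f\n", p);
}
```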
-template +template struct FusedAdamMathFunctor { static_assert( depth == 4 || depth == 5, @@ -112,33 +112,37 @@ struct FusedAdamMathFunctor { int chunk_size, FusedOptimizerTensorListMetadata& tl, const float* lr_ptr, - const double lr, - const double beta1, - const double beta2, - const double weight_decay, - const double eps, - const bool maximize, - const bool amsgrad, + const double& lr, + const double& beta1, + const double& beta2, + const double& weight_decay, + const double& eps, + const bool& maximize, const float* grad_scale_ptr, - const float* found_inf_ptr, - const ADAM_MODE adam_mode) { - int tensor_loc = tl.block_to_tensor[blockIdx.x]; - int chunk_idx = tl.block_to_chunk[blockIdx.x]; - int n = tl.numel_for_tensor[tensor_loc]; - double lr_double = lr_ptr ? *lr_ptr : lr; + const float* found_inf_ptr) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + const double lr_double = lr_ptr ? *lr_ptr : lr; if (found_inf_ptr && *found_inf_ptr == 1) { return; } - auto* step_count = - reinterpret_cast(tl.state_steps_addresses[tensor_loc]); + const auto [bias_correction1, bias_correction2_sqrt] = + [&]() -> std::pair { + auto* step_count = + reinterpret_cast(tl.state_steps_addresses[tensor_loc]); + const auto bias_correction1 = 1 - at::native::pow_(beta1, *step_count); + const auto bias_correction2 = 1 - at::native::pow_(beta2, *step_count); + const auto bias_correction2_sqrt = std::sqrt(bias_correction2); + return {bias_correction1, bias_correction2_sqrt}; + }(); scalar_type* args[depth]; - const bool all_aligned{ - init_args(args, tl, chunk_idx, chunk_size, tensor_loc)}; - n -= chunk_idx * chunk_size; scalar_type r_args[depth][kILP]; + const auto n = tl.numel_for_tensor[tensor_loc] - chunk_idx * chunk_size; + const bool all_aligned{ + init_args(args, tl, chunk_idx, chunk_size, tensor_loc)}; if ((n % kILP == 0) && (chunk_size % kILP == 0) && all_aligned) { for (int64_t i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; @@ -147,19 +151,18 @@ struct FusedAdamMathFunctor { for (int i = 0; i < depth; i++) { load_store(r_args[i], args[i], 0, i_start); } - adam_math( + adam_math( r_args, - step_count, lr_double, beta1, beta2, weight_decay, eps, maximize, - amsgrad, grad_scale_ptr, found_inf_ptr, - adam_mode); + bias_correction1, + bias_correction2_sqrt); #pragma unroll for (int i = 0; i < depth; i++) { if (i != kGradIdx || grad_scale_ptr) { @@ -171,19 +174,18 @@ struct FusedAdamMathFunctor { for (int64_t i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { load_args(r_args, args, i_start, chunk_size, n); - adam_math( + adam_math( r_args, - step_count, lr_double, beta1, beta2, weight_decay, eps, maximize, - amsgrad, grad_scale_ptr, found_inf_ptr, - adam_mode); + bias_correction1, + bias_correction2_sqrt); #pragma unroll for (int i = 0; i < depth; i++) { if (i != kGradIdx || grad_scale_ptr) { diff --git a/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu b/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu index 91f6619973317..376711c39db6d 100644 --- a/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu @@ -31,11 +31,11 @@ void _fused_adamw_amsgrad_cuda_impl_( exp_avg_sqs.vec(), max_exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; - float* lr_ptr = nullptr; + const float* lr_ptr = nullptr; AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -46,7 +46,7 @@ void _fused_adamw_amsgrad_cuda_impl_( multi_tensor_apply_for_fused_optimizer<5>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, // unused lr, beta1, @@ -54,10 +54,8 @@ void _fused_adamw_amsgrad_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ true, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ADAMW); + found_inf_ptr); }); } @@ -84,11 +82,11 @@ void _fused_adamw_amsgrad_cuda_impl_( exp_avg_sqs.vec(), max_exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? found_inf->data_ptr() : nullptr; - float* lr_ptr = lr.data_ptr(); + const float* lr_ptr = lr.const_data_ptr(); AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -99,7 +97,7 @@ void _fused_adamw_amsgrad_cuda_impl_( multi_tensor_apply_for_fused_optimizer<5>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, 1.0, // unused beta1, @@ -107,10 +105,8 @@ void _fused_adamw_amsgrad_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ true, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ADAMW); + found_inf_ptr); }); } diff --git a/aten/src/ATen/native/cuda/fused_adamw_impl.cu b/aten/src/ATen/native/cuda/fused_adamw_impl.cu index 847f05671360d..cc4feaa145122 100644 --- a/aten/src/ATen/native/cuda/fused_adamw_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adamw_impl.cu @@ -26,11 +26,11 @@ void _fused_adamw_cuda_impl_( std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? found_inf->data_ptr() : nullptr; - float* lr_ptr = nullptr; + const float* lr_ptr = nullptr; AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -41,7 +41,7 @@ void _fused_adamw_cuda_impl_( multi_tensor_apply_for_fused_optimizer<4>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, // unused lr, beta1, @@ -49,10 +49,8 @@ void _fused_adamw_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ false, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ADAMW); + found_inf_ptr); }); } @@ -74,11 +72,11 @@ void _fused_adamw_cuda_impl_( std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; - float* lr_ptr = lr.data_ptr(); + const float* lr_ptr = lr.const_data_ptr(); AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -89,7 +87,7 @@ void _fused_adamw_cuda_impl_( multi_tensor_apply_for_fused_optimizer<4>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, 1.0, // unused beta1, @@ -97,10 +95,8 @@ void _fused_adamw_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ false, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ADAMW); + found_inf_ptr); }); } diff --git a/aten/src/ATen/native/cuda/group_norm_kernel.cu b/aten/src/ATen/native/cuda/group_norm_kernel.cu index 5a29338c303e0..f3ed79e745382 100644 --- a/aten/src/ATen/native/cuda/group_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/group_norm_kernel.cu @@ -496,11 +496,11 @@ void GroupNorm1dForward( auto iter = TensorIteratorConfig() .resize_outputs(false) .add_owned_output(Y.view({N, G, D})) - .add_owned_input(X.view({N, G, D})) + .add_owned_const_input(X.view({N, G, D})) .add_owned_input(mean.view({N, G, 1})) .add_owned_input(rstd.view({N, G, 1})) - .add_owned_input(gamma.view({1, G, D})) - .add_owned_input(beta.view({1, G, D})) + .add_owned_const_input(gamma.view({1, G, D})) + .add_owned_const_input(beta.view({1, G, D})) .build(); gpu_kernel(iter, [] GPU_LAMBDA(T x, T mean, T rstd, T gamma, T beta) -> T { return (static_cast(x) - static_cast(mean)) * @@ -511,10 +511,10 @@ void GroupNorm1dForward( auto iter = TensorIteratorConfig() .resize_outputs(false) .add_owned_output(Y.view({N, G, D})) - .add_owned_input(X.view({N, G, D})) + .add_owned_const_input(X.view({N, G, D})) .add_owned_input(mean.view({N, G, 1})) .add_owned_input(rstd.view({N, G, 1})) - .add_owned_input(gamma.view({1, G, D})) + .add_owned_const_input(gamma.view({1, G, D})) .build(); gpu_kernel(iter, [] GPU_LAMBDA(T x, T mean, T rstd, T gamma) -> T { return (static_cast(x) - static_cast(mean)) * @@ -524,10 +524,10 @@ void GroupNorm1dForward( auto iter = TensorIteratorConfig() .resize_outputs(false) .add_owned_output(Y.view({N, G, D})) - .add_owned_input(X.view({N, G, D})) + .add_owned_const_input(X.view({N, G, D})) .add_owned_input(mean.view({N, G, 1})) .add_owned_input(rstd.view({N, G, 1})) - .add_owned_input(beta.view({1, G, D})) + .add_owned_const_input(beta.view({1, G, D})) .build(); gpu_kernel(iter, [] GPU_LAMBDA(T x, T mean, T rstd, T beta) -> T { return (static_cast(x) - static_cast(mean)) * @@ -538,7 +538,7 @@ void GroupNorm1dForward( auto iter = TensorIteratorConfig() .resize_outputs(false) .add_owned_output(Y.view({N * G, D})) - .add_owned_input(X.view({N * G, D})) + .add_owned_const_input(X.view({N * G, D})) .add_owned_input(mean.view({N * G, 1})) .add_owned_input(rstd.view({N * G, 1})) .build(); @@ -590,7 +590,7 @@ void GroupNormKernelImplInternal( auto iter = TensorIteratorConfig() .resize_outputs(false) .add_owned_output(Y.view({N * G, D * HxW})) - .add_owned_input(X.view({N * G, D * HxW})) + .add_owned_const_input(X.view({N * G, D * HxW})) .add_owned_input(mean.view({N * G, 1})) .add_owned_input(rstd.view({N * G, 1})) .build(); @@ -611,7 +611,7 @@ void GroupNormKernelImplInternal( T_ACC* b_data = b.mutable_data_ptr(); // TODO: Since there is some issues in gpu_kernel_multiple_outputs, we are - // using maunal kernel here. Make it using gpu_kernel_multiple_outputs once + // using manual kernel here. Make it using gpu_kernel_multiple_outputs once // the issue fixed. 
const int64_t B = (N * C + kCUDANumThreads - 1) / kCUDANumThreads; ComputeFusedParamsCUDAKernel<<>>( @@ -622,7 +622,7 @@ void GroupNormKernelImplInternal( .check_all_same_dtype(std::is_same::value) .resize_outputs(false) .add_owned_output(Y.view({N * C, HxW})) - .add_owned_input(X.view({N * C, HxW})) + .add_owned_const_input(X.view({N * C, HxW})) .add_owned_input(a.view({N * C, 1})) .add_owned_input(b.view({N * C, 1})) .build(); @@ -719,12 +719,12 @@ void GroupNorm1dBackward( .check_all_same_dtype(std::is_same::value) .resize_outputs(false) .add_owned_output(dX.view({N, G, D})) - .add_owned_input(dY.view({N, G, D})) - .add_owned_input(X.view({N, G, D})) - .add_owned_input(rstd.view({N, G, 1})) - .add_owned_input(gamma.view({1, G, D})) - .add_owned_input(c2.view({N, G, 1})) - .add_owned_input(c3.view({N, G, 1})) + .add_owned_const_input(dY.view({N, G, D})) + .add_owned_const_input(X.view({N, G, D})) + .add_owned_const_input(rstd.view({N, G, 1})) + .add_owned_const_input(gamma.view({1, G, D})) + .add_owned_const_input(c2.view({N, G, 1})) + .add_owned_const_input(c3.view({N, G, 1})) .build(); gpu_kernel( iter, @@ -739,11 +739,11 @@ void GroupNorm1dBackward( .check_all_same_dtype(std::is_same::value) .resize_outputs(false) .add_owned_output(dX.view({N * G, D})) - .add_owned_input(dY.view({N * G, D})) - .add_owned_input(X.view({N * G, D})) - .add_owned_input(rstd.view({N * G, 1})) - .add_owned_input(c2.view({N * G, 1})) - .add_owned_input(c3.view({N * G, 1})) + .add_owned_const_input(dY.view({N * G, D})) + .add_owned_const_input(X.view({N * G, D})) + .add_owned_const_input(rstd.view({N * G, 1})) + .add_owned_const_input(c2.view({N * G, 1})) + .add_owned_const_input(c3.view({N * G, 1})) .build(); gpu_kernel( iter, [] GPU_LAMBDA(T dy, T x, T rstd, T_ACC c2, T_ACC c3) -> T { @@ -772,7 +772,7 @@ void GroupNorm1dBackward( } else { const int64_t B = (C + kReduceTileSize - 1) / kReduceTileSize; // The algorithm for colwise reduction here is to accumulate each 32 cols - // to a 32 * 32 tile and write the tile to shared memmory. Then do warp + // to a 32 * 32 tile and write the tile to shared memory. Then do warp // reduce for each col in the tile. So here the blockDim must be (32, 16). 
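(Editorial aside, not part of the patch: the comment above describes the tiled column-wise reduction used for the weight/bias gradients — partial sums for 32 columns are staged in a 32 x 32 shared-memory tile, then each column is collapsed by one warp. The kernel below is a simplified, hypothetical illustration of that idea; it uses a (32, 32) block rather than the (32, 16) block the real kernel requires, and plain column sums instead of the fused gamma/beta terms.)

```cuda
// Hypothetical illustration, not the PyTorch kernel. Sums each column of a
// row-major [rows x cols] matrix. Launch with blockDim = dim3(32, 32) and
// gridDim.x = ceil_div(cols, 32).
__global__ void colwise_sum(const float* in, float* out, int rows, int cols) {
  __shared__ float tile[32][33];  // padded second dim avoids bank conflicts
  const int col = blockIdx.x * 32 + threadIdx.x;

  // Phase 1: each thread strides down one column, accumulating a partial sum.
  float acc = 0.f;
  if (col < cols) {
    for (int r = threadIdx.y; r < rows; r += 32) {
      acc += in[static_cast<long long>(r) * cols + col];
    }
  }
  tile[threadIdx.y][threadIdx.x] = acc;
  __syncthreads();

  // Phase 2: read the tile transposed so one warp owns the 32 partial sums of
  // a single column, then collapse them with warp shuffles.
  float v = tile[threadIdx.x][threadIdx.y];
  for (int offset = 16; offset > 0; offset >>= 1) {
    v += __shfl_down_sync(0xffffffffu, v, offset);
  }
  if (threadIdx.x == 0) {  // lane 0 holds the column total
    const int out_col = blockIdx.x * 32 + threadIdx.y;
    if (out_col < cols) {
      out[out_col] = v;
    }
  }
}
```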
constexpr int kThreadX = kReduceTileSize; constexpr int kThreadY = kReduceTileSize / 2; @@ -865,8 +865,8 @@ void GroupNormBackwardKernelImplInternal( auto iter = TensorIteratorConfig() .check_all_same_dtype(std::is_same::value) .add_output(c1) - .add_owned_input(rstd.view({N, G, 1})) - .add_owned_input(gamma.view({1, G, D})) + .add_owned_const_input(rstd.view({N, G, 1})) + .add_owned_const_input(gamma.view({1, G, D})) .build(); gpu_kernel(iter, [] GPU_LAMBDA(T rstd, T gamma) -> T_ACC { return static_cast(rstd) * static_cast(gamma); @@ -895,11 +895,11 @@ void GroupNormBackwardKernelImplInternal( .check_all_same_dtype(std::is_same::value) .resize_outputs(false) .add_owned_output(dX.view({N * G, D, HxW})) - .add_owned_input(dY.view({N * G, D, HxW})) - .add_owned_input(X.view({N * G, D, HxW})) - .add_owned_input(c1.view({N * G, D, 1})) - .add_owned_input(c2.view({N * G, 1, 1})) - .add_owned_input(c3.view({N * G, 1, 1})) + .add_owned_const_input(dY.view({N * G, D, HxW})) + .add_owned_const_input(X.view({N * G, D, HxW})) + .add_owned_const_input(c1.view({N * G, D, 1})) + .add_owned_const_input(c2.view({N * G, 1, 1})) + .add_owned_const_input(c3.view({N * G, 1, 1})) .build(); gpu_kernel( iter, [] GPU_LAMBDA(T dy, T x, T_ACC c1, T_ACC c2, T_ACC c3) -> T { @@ -911,11 +911,11 @@ void GroupNormBackwardKernelImplInternal( .check_all_same_dtype(std::is_same::value) .resize_outputs(false) .add_owned_output(dX.view({N * G, D * HxW})) - .add_owned_input(dY.view({N * G, D * HxW})) - .add_owned_input(X.view({N * G, D * HxW})) - .add_owned_input(rstd.view({N * G, 1})) - .add_owned_input(c2.view({N * G, 1})) - .add_owned_input(c3.view({N * G, 1})) + .add_owned_const_input(dY.view({N * G, D * HxW})) + .add_owned_const_input(X.view({N * G, D * HxW})) + .add_owned_const_input(rstd.view({N * G, 1})) + .add_owned_const_input(c2.view({N * G, 1})) + .add_owned_const_input(c3.view({N * G, 1})) .build(); gpu_kernel( iter, [] GPU_LAMBDA(T dy, T x, T_ACC c1, T_ACC c2, T_ACC c3) -> T { @@ -944,7 +944,7 @@ void GroupNormBackwardKernelImplInternal( } else { const int64_t B = (C + kReduceTileSize - 1) / kReduceTileSize; // The algorithm for colwise reduction here is to accumulate each 32 cols - // to a 32 * 32 tile and write the tile to shared memmory. Then do warp + // to a 32 * 32 tile and write the tile to shared memory. Then do warp // reduce for each col in the tile. So here the blockDim must be (32, 16). 
constexpr int kThreadX = kReduceTileSize; constexpr int kThreadY = kReduceTileSize / 2; diff --git a/aten/src/ATen/native/cuda/im2col.cuh b/aten/src/ATen/native/cuda/im2col.cuh index 06eef13208c67..ec74617de34a1 100644 --- a/aten/src/ATen/native/cuda/im2col.cuh +++ b/aten/src/ATen/native/cuda/im2col.cuh @@ -34,7 +34,7 @@ __global__ void im2col_kernel( const int64_t height_col, const int64_t width_col, dt* data_col) { - CUDA_KERNEL_LOOP(index, n) { + CUDA_KERNEL_LOOP_TYPE(index, n, int64_t) { int64_t w_out = index % width_col; int64_t idx = index / width_col; diff --git a/aten/src/ATen/native/cuda/int4mm.cu b/aten/src/ATen/native/cuda/int4mm.cu index 07a70013b26f4..fcfcd2e5ebbdb 100644 --- a/aten/src/ATen/native/cuda/int4mm.cu +++ b/aten/src/ATen/native/cuda/int4mm.cu @@ -868,7 +868,6 @@ at::Tensor _weight_int4pack_mm_cuda( int64_t qGroupSize, const at::Tensor& qScaleAndZeros) { c10::cuda::CUDAGuard g(A.device()); - auto stream = at::cuda::getCurrentCUDAStream(); TORCH_CHECK( A.device() == B.device() && A.device() == qScaleAndZeros.device()); @@ -926,6 +925,7 @@ at::Tensor _weight_int4pack_mm_cuda( {m, n}, at::TensorOptions().dtype(at::kBFloat16).device(A.device())); #if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)) + auto stream = at::cuda::getCurrentCUDAStream(); #define RUN_GEMM(WARPS, K_TILES_PER_WARP, Q_GROUP_SIZE, REDUCE_TYPE) \ do { \ using ACLayout = ALayout_RM; \ @@ -1041,7 +1041,6 @@ at::Tensor _convert_weight_to_int4pack_cuda( const at::Tensor& in, int64_t innerKTiles) { c10::cuda::CUDAGuard g(in.device()); - auto stream = at::cuda::getCurrentCUDAStream(); TORCH_CHECK(in.dim() == 2); TORCH_CHECK(in.dtype() == at::kInt); @@ -1072,6 +1071,7 @@ at::Tensor _convert_weight_to_int4pack_cuda( at::TensorOptions().dtype(at::kInt).device(in.device())); #if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)) + auto stream = at::cuda::getCurrentCUDAStream(); dim3 grid(kSuperTiles, nTiles); if (innerKTiles == 2) { diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index 61781e03b4a96..6e804efe5f847 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -20,7 +20,7 @@ #include #include -// TODO: C++17 has the fileystem header, which may replace these +// TODO: C++17 has the filesystem header, which may replace these #ifdef _WIN32 // On Windows, the POSIX implementations are considered deprecated. We simply map to the newer variant. #include @@ -1500,7 +1500,11 @@ NvrtcFunction jit_pwise_function( std::stringstream ss; ss << *cache_dir << "/"; ss << kernel_name; +#ifdef USE_ROCM + ss << "_arch" << prop->gcnArchName; +#else ss << "_arch" << cuda_major << "." << cuda_minor; +#endif ss << "_nvrtc" << nvrtc_major << "." << nvrtc_minor; ss << (compile_to_sass ? 
"_sass" : "_ptx"); ss << "_" << code.length(); @@ -1510,7 +1514,7 @@ NvrtcFunction jit_pwise_function( std::ifstream readin{file_path, std::ios::in | std::ifstream::binary}; if (readin.fail()) { // NOTE: this does not warn because the file might not exist - // TODO: consider if this should explicilty check for the file's existence or not to throw + // TODO: consider if this should explicitly check for the file's existence or not to throw // an informative warning readin.close(); } else { @@ -1537,7 +1541,7 @@ NvrtcFunction jit_pwise_function( // Constructs nvrtc build arguments // CUDA 11.1 allows going directly to SASS (sm_) instead of PTX (compute_) // which gives better backwards compatibility to work on older driver, - // (since older driver doesn't necessrily recognize PTX emitted by new + // (since older driver doesn't necessarily recognize PTX emitted by new // toolkit); // Meanwhile, for forward compatibility (future device with // `unsupported_arch==True`), since SASS are not necessarily compatible, @@ -1565,11 +1569,9 @@ NvrtcFunction jit_pwise_function( if (compilation_result != NVRTC_SUCCESS) { size_t logsize; AT_CUDA_NVRTC_CHECK(nvrtc.nvrtcGetProgramLogSize(program, &logsize)); - std::vector log(logsize); - AT_CUDA_NVRTC_CHECK(nvrtc.nvrtcGetProgramLog(program, log.data())); - std::stringstream cu; - cu << log.data(); - throw std::runtime_error(code + cu.str()); + std::string log(logsize, '\0'); + AT_CUDA_NVRTC_CHECK(nvrtc.nvrtcGetProgramLog(program, &log[0])); + throw std::runtime_error(code + log); } size_t ptx_size = 0; diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index bc5190874ffee..6423dddbb2995 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -297,7 +297,7 @@ __device__ __inline__ void vectorized_layer_norm_kernel_impl( //to avoid windows SFINAE errors template -__global__ __inline__ void vectorized_layer_norm_kernel( +__global__ void vectorized_layer_norm_kernel( const int N, T_ACC eps, const T* __restrict__ X, @@ -393,7 +393,7 @@ __global__ void layer_norm_grad_input_kernel( // This implementation gets called when input buffers (dY, X, gamma and dX) are aligned // to vec_size * sizeof(T). Compared to the unvectorized implementation, it is about 10% -// faster measuread at PT operator level, with cases seeing a 2X speedup (where N >> M). +// faster measured at PT operator level, with cases seeing a 2X speedup (where N >> M). // There are no noticeable regressions on the rest of the sizes. template @@ -1149,12 +1149,12 @@ void LayerNormBackwardKernelImplInternal( file a support request to support bigger batches"); TORCH_CHECK(N <= std::numeric_limits::max(), "Normalized shape should have less than INT_MAX elements, \ file a support request to support bigger normalized shapes"); - const T* dY_data = dY.template data_ptr(); - const T* X_data = X.template data_ptr(); - const T_ACC* mean_data = mean.template data_ptr(); - const T_ACC* rstd_data = rstd.template data_ptr(); + const T* dY_data = dY.template const_data_ptr(); + const T* X_data = X.template const_data_ptr(); + const T_ACC* mean_data = mean.template const_data_ptr(); + const T_ACC* rstd_data = rstd.template const_data_ptr(); const T* gamma_data = - gamma.defined() ? gamma.template data_ptr() : nullptr; + gamma.defined() ? gamma.template const_data_ptr() : nullptr; T* dX_data = dX->defined() ? 
dX->template data_ptr() : nullptr; cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); const int warp_size = at::cuda::warp_size(); diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp index a08547dc21b6a..5471c57ec30ed 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp @@ -1992,7 +1992,7 @@ void linalg_eigh_magma(const Tensor& eigenvalues, const Tensor& eigenvectors, co } void linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, const Tensor& infos, bool upper, bool compute_eigenvectors) { -#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM) +#if defined(USE_LINALG_SOLVER) auto preferred_backend = at::globalContext().linalgPreferredBackend(); switch (preferred_backend) { case at::LinalgBackend::Magma: @@ -2427,7 +2427,7 @@ static void lu_solve_kernel(const Tensor& LU, const Tensor& pivots, const Tensor // magma implementation of LU solve cannot handle a b tensor with last dim > 1024 // See https://bitbucket.org/icl/magma/issues/19/dgesv_batched-dgetrs_batched-fails-for bool over_batched_magma_dim_limit = k > 1024; - // heuristics determined from tests dicussed in https://github.com/pytorch/pytorch/pull/72935 + // heuristics determined from tests discussed in https://github.com/pytorch/pytorch/pull/72935 // Computes X = U^{-1}L^{-1}P^T B via triangular solves // Helps mitigating the bugs in magma @@ -2443,7 +2443,7 @@ static void lu_solve_kernel(const Tensor& LU, const Tensor& pivots, const Tensor .resize_outputs(false) .declare_static_shape(pivots_->sizes(), /*squash_dim=*/pivots_->dim() - 1) .add_output(perm) - .add_input(*pivots_) + .add_const_input(*pivots_) .build(); unpack_pivots_stub(pivots_->device().type(), iter, n, n); diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp index ec65435d6c8df..06b095af4f66e 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp @@ -135,11 +135,11 @@ void apply_ldl_solve_cusolver( auto b_stride = B.dim() > 2 ? B.stride(-3) : 0; auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; - auto a_data = A.data_ptr(); + auto a_data = A.const_data_ptr(); auto b_data = B.data_ptr(); auto pivots_ = pivots.to(kLong); - auto pivots_data = pivots_.data_ptr(); + auto pivots_data = pivots_.const_data_ptr(); // needed to run ldl_solve tests in parallel // see https://github.com/pytorch/pytorch/issues/82894 for examples of failures @@ -175,9 +175,9 @@ void apply_ldl_solve_cusolver( Tensor info = at::zeros({}, A.options().dtype(at::kInt)); for (const auto i : c10::irange(batch_size)) { - auto* a_working_ptr = &a_data[i * a_stride]; + const auto* a_working_ptr = &a_data[i * a_stride]; auto* b_working_ptr = &b_data[i * b_stride]; - auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; + const auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; TORCH_CUSOLVER_CHECK(cusolverDnXsytrs( handle, uplo, @@ -1078,8 +1078,8 @@ static void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& ot auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; auto trans = transpose ? (input.is_complex() ? 
CUBLAS_OP_C : CUBLAS_OP_T) : CUBLAS_OP_N; - auto input_data = input.data_ptr(); - auto tau_data = tau.data_ptr(); + auto input_data = input.const_data_ptr(); + auto tau_data = tau.const_data_ptr(); auto other_data = other.data_ptr(); auto input_matrix_stride = matrixStride(input); @@ -1101,9 +1101,9 @@ static void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& ot auto info_data = info.data_ptr(); for (auto i = decltype(batch_size){0}; i < batch_size; i++) { - scalar_t* input_working_ptr = &input_data[i * input_matrix_stride]; + const scalar_t* input_working_ptr = &input_data[i * input_matrix_stride]; scalar_t* other_working_ptr = &other_data[i * other_matrix_stride]; - scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; + const scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; auto handle = at::cuda::getCurrentCUDASolverDnHandle(); // allocate workspace storage @@ -1149,7 +1149,7 @@ void ormqr_cusolver(const Tensor& input, const Tensor& tau, const Tensor& other, template inline static void apply_orgqr(Tensor& self, const Tensor& tau) { auto self_data = self.data_ptr(); - auto tau_data = tau.data_ptr(); + auto tau_data = tau.const_data_ptr(); auto self_matrix_stride = matrixStride(self); auto batchsize = cuda_int_cast(batchCount(self), "batch size"); auto m = cuda_int_cast(self.size(-2), "m"); @@ -1180,7 +1180,7 @@ inline static void apply_orgqr(Tensor& self, const Tensor& tau) { for (auto i = decltype(batchsize){0}; i < batchsize; i++) { scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; + const scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; auto handle = at::cuda::getCurrentCUDASolverDnHandle(); // allocate workspace storage @@ -1434,8 +1434,12 @@ static void linalg_eigh_cusolver_syevj_batched(const Tensor& eigenvalues, const } void linalg_eigh_cusolver(const Tensor& eigenvalues, const Tensor& eigenvectors, const Tensor& infos, bool upper, bool compute_eigenvectors) { + // for ROCm's hipSolver, syevj is fastest. 
+#ifdef USE_ROCM + linalg_eigh_cusolver_syevj(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors); +#else if (use_cusolver_syevj_batched_ && batchCount(eigenvectors) > 1 && eigenvectors.size(-1) <= 32) { - // Use syevjBatched for batched matrix opertion when matrix size <= 32 + // Use syevjBatched for batched matrix operation when matrix size <= 32 // See https://github.com/pytorch/pytorch/pull/53040#issuecomment-788264724 linalg_eigh_cusolver_syevj_batched(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors); } else if (eigenvectors.scalar_type() == at::kFloat && eigenvectors.size(-1) >= 32 && eigenvectors.size(-1) <= 512) { @@ -1445,6 +1449,7 @@ void linalg_eigh_cusolver(const Tensor& eigenvalues, const Tensor& eigenvectors, } else { linalg_eigh_cusolver_syevd(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors); } +#endif } // The 'apply_' word is used for templated by dtype functions that call an API routine diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLibBlas.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLibBlas.cpp index 38e7b8dd3288b..2a9f46e6f73e7 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLibBlas.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLibBlas.cpp @@ -133,7 +133,7 @@ static void apply_lu_solve_batched_cublas(const Tensor& LU, const Tensor& pivots TORCH_INTERNAL_ASSERT(batchCount(LU) == batchCount(pivots.unsqueeze(-1)), "batch_size of LU and pivots must be the same"); const auto trans = to_cublas(transpose); - auto pivots_data = pivots.data_ptr(); + auto pivots_data = pivots.const_data_ptr(); auto batch_size = cuda_int_cast(batchCount(LU), "batch_size");; auto m = cuda_int_cast(LU.size(-2), "m"); auto nrhs = cuda_int_cast(B.size(-1), "nrhs"); @@ -142,12 +142,12 @@ static void apply_lu_solve_batched_cublas(const Tensor& LU, const Tensor& pivots Tensor lu_ptr_array = get_device_pointers(LU); Tensor b_ptr_array = get_device_pointers(B); - auto lu_ptr_array_data = reinterpret_cast(lu_ptr_array.data_ptr()); + auto lu_ptr_array_data = reinterpret_cast(lu_ptr_array.const_data_ptr()); auto b_ptr_array_data = reinterpret_cast(b_ptr_array.data_ptr()); auto handle = at::cuda::getCurrentCUDABlasHandle(); - at::cuda::blas::getrsBatched(handle, trans, m, nrhs, lu_ptr_array_data, - lda, pivots_data, b_ptr_array_data, lda, &info, batch_size); + at::cuda::blas::getrsBatched(handle, trans, m, nrhs, const_cast(lu_ptr_array_data), + lda, const_cast(pivots_data), b_ptr_array_data, lda, &info, batch_size); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0); } @@ -218,6 +218,20 @@ static void apply_triangular_solve_batched(const Tensor& A, const Tensor& B, boo } void triangular_solve_batched_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) { + // Workaround the following a bug on CUDA < 12.1 + // RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasStrsmBatched + // See https://github.com/pytorch/pytorch/issues/79191#issuecomment-1154222580 +#if defined(CUSOLVER_VERSION) && CUSOLVER_VERSION < 12100 + constexpr auto max_batch_size = 524280; + if (B.size(-1) > max_batch_size) { + auto n_chunks = (B.size(-1) + max_batch_size - 1) / max_batch_size; // ceildiv + auto splits = B.split(n_chunks, /*dim=*/-1); + for (const Tensor& b : splits) { + triangular_solve_batched_cublas(A, b, left, upper, transpose, unitriangular); + } + return; + } +#endif AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cuda", [&]{ 
apply_triangular_solve_batched(A, B, left, upper, transpose, unitriangular); }); diff --git a/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp b/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp index bdb0d26a1b690..3016897c66c5d 100644 --- a/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp +++ b/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp @@ -29,7 +29,7 @@ using CuSolverDnPoolType = DeviceThreadHandlePool #include +#include #include #ifndef AT_PER_OPERATOR_HEADERS #include #include #else -#include -#include #include +#include +#include #endif #if !AT_CUDNN_ENABLED() -namespace at { namespace native { +namespace at { +namespace native { // See Note [ATen preprocessor philosophy] Tensor cudnn_affine_grid_generator_forward( const Tensor& theta, - int64_t N, int64_t C, int64_t H, int64_t W) { - AT_ERROR("cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); + int64_t N, + int64_t C, + int64_t H, + int64_t W) { + AT_ERROR( + "cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); } Tensor cudnn_affine_grid_generator_backward( const Tensor& grad_theta, - int64_t N, int64_t C, int64_t H, int64_t W) { - AT_ERROR("cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); + int64_t N, + int64_t C, + int64_t H, + int64_t W) { + AT_ERROR( + "cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); } -}} +} // namespace native +} // namespace at #else // AT_CUDNN_ENABLED() -#include -#include +#include #include +#include #include #include -#include +#include #include -namespace at { namespace native { +namespace at { +namespace native { namespace { -void setSamplerDescriptor(SpatialTransformerDescriptor& desc, - cudnnDataType_t dataType, - int N, int C, int H, int W) -{ +void setSamplerDescriptor( + SpatialTransformerDescriptor& desc, + cudnnDataType_t dataType, + int N, + int C, + int H, + int W) { int inputSize[4] = {N, C, H, W}; desc.set(dataType, 4, inputSize); } -} // namespace +} // namespace Tensor cudnn_affine_grid_generator_forward( const Tensor& theta_t, - int64_t N, int64_t C, int64_t H, int64_t W) -{ + int64_t N, + int64_t C, + int64_t H, + int64_t W) { auto theta_t_contig = theta_t.contiguous(); - TensorArg theta{ theta_t_contig, "theta", 1 }; + TensorArg theta{theta_t_contig, "theta", 1}; CheckedFrom c = "cudnn_affine_grid_generator_forward"; checkContiguous(c, theta); checkSize(c, theta, {N, 2, 3}); @@ -73,18 +89,19 @@ Tensor cudnn_affine_grid_generator_forward( auto dataType = getCudnnDataType(*theta); SpatialTransformerDescriptor desc; setSamplerDescriptor(desc, dataType, N, C, H, W); - AT_CUDNN_CHECK(cudnnSpatialTfGridGeneratorForward(getCudnnHandle(), desc.desc(), - theta->data_ptr(), - grid_t.data_ptr())); + AT_CUDNN_CHECK(cudnnSpatialTfGridGeneratorForward( + getCudnnHandle(), desc.desc(), theta->data_ptr(), grid_t.data_ptr())); return grid_t; } Tensor cudnn_affine_grid_generator_backward( const Tensor& grad_grid_t, - int64_t N, int64_t C, int64_t H, int64_t W) -{ + int64_t N, + int64_t C, + int64_t H, + int64_t W) { auto grad_grid_contig = grad_grid_t.contiguous(); - TensorArg grad_grid{ grad_grid_contig, "grad_grid", 1 }; + TensorArg grad_grid{grad_grid_contig, "grad_grid", 1}; CheckedFrom c = "cudnn_affine_grid_generator_backward"; checkContiguous(c, grad_grid); checkSize(c, grad_grid, {N, H, W, 2}); @@ -95,12 +112,15 @@ Tensor cudnn_affine_grid_generator_backward( auto dataType = getCudnnDataType(grad_theta_t); SpatialTransformerDescriptor desc; 
setSamplerDescriptor(desc, dataType, N, C, H, W); - AT_CUDNN_CHECK(cudnnSpatialTfGridGeneratorBackward(getCudnnHandle(), desc.desc(), - grad_grid->data_ptr(), - grad_theta_t.data_ptr())); + AT_CUDNN_CHECK(cudnnSpatialTfGridGeneratorBackward( + getCudnnHandle(), + desc.desc(), + grad_grid->data_ptr(), + grad_theta_t.data_ptr())); return grad_theta_t; } -}} // namespace at::native +} // namespace native +} // namespace at #endif // AT_CUDNN_ENABLED() diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index f18318fd0dcf8..44b004dff0007 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -1,36 +1,63 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include #include +#include #include +#ifdef __HIP_PLATFORM_AMD__ +#include +#else +#include +#endif + #if !AT_CUDNN_ENABLED() -namespace at { namespace native { +namespace at { +namespace native { // See Note [ATen preprocessor philosophy] std::tuple cudnn_batch_norm( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, - bool training, double exponential_average_factor, double epsilon) { + const Tensor& input, + const Tensor& weight, + const c10::optional& bias_opt, + const c10::optional& running_mean_opt, + const c10::optional& running_var_opt, + bool training, + double exponential_average_factor, + double epsilon) { AT_ERROR("cudnn_batch_norm: ATen not compiled with cuDNN support"); } std::tuple cudnn_batch_norm_backward( - const Tensor& input, const Tensor& grad_output, const Tensor& weight, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_var_opt, - double epsilon, const Tensor& reservedSpace) { + const Tensor& input, + const Tensor& grad_output, + const Tensor& weight, + const c10::optional& running_mean_opt, + const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, + const c10::optional& save_var_opt, + double epsilon, + const Tensor& reservedSpace) { AT_ERROR("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); } -}} // namespace at::native +size_t _get_cudnn_batch_norm_reserve_space_size( + const Tensor& input_t, + bool training) { + AT_ERROR( + "_get_cudnn_batch_norm_reserve_space_size: ATen not compiled with cuDNN support"); +} + +} // namespace native +} // namespace at #else // AT_CUDNN_ENABLED +#include +#include #include #include #include -#include - -#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -42,33 +69,29 @@ std::tuple cudnn_batch_norm_backward( #include #endif -namespace at { namespace native { +namespace at { +namespace native { namespace { Tensor expandScale(const Tensor& t, int64_t dim) { - std::vector size{ 1, t.numel() }; + std::vector size{1, t.numel()}; while (static_cast(size.size()) < dim) { size.emplace_back(1); } return t.view(size); } -cudnnBatchNormMode_t getCudnnBatchNormMode(bool training, at::MemoryFormat memory_format, int64_t dim) { +cudnnBatchNormMode_t getCudnnBatchNormMode( + bool training, + at::MemoryFormat memory_format, + int64_t dim) { if (dim == 2) { return CUDNN_BATCHNORM_PER_ACTIVATION; } else if (training && memory_format == at::MemoryFormat::ChannelsLast) { - return CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (training && memory_format == at::MemoryFormat::ChannelsLast3d) { - -#if CUDNN_VERSION >= 8100 return CUDNN_BATCHNORM_SPATIAL_PERSISTENT; -#else - return 
CUDNN_BATCHNORM_SPATIAL; -#endif // CUDNN_VERSION >= 8100 - } else { // TODO: The new CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode was // introduced in CuDNN 7 for performance optimization, but it results in @@ -78,23 +101,44 @@ cudnnBatchNormMode_t getCudnnBatchNormMode(bool training, at::MemoryFormat memor } } -} // namespace +} // namespace + +size_t _get_cudnn_batch_norm_reserve_space_size( + const Tensor& input_t, + bool training) { + size_t reserve_size; + TensorArg input{input_t, "input", 1}; + TensorDescriptor idesc{*input, 4}; + auto handle = getCudnnHandle(); + cudnnBatchNormMode_t mode = getCudnnBatchNormMode( + training, input->suggest_memory_format(), input->dim()); + auto op = CUDNN_BATCHNORM_OPS_BN; + AT_CUDNN_CHECK(cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + handle, mode, op, nullptr, idesc.desc(), &reserve_size)); + return reserve_size; +} std::tuple cudnn_batch_norm( - const Tensor& input_t, const Tensor& weight_t, const c10::optional& bias_t_opt, const c10::optional& running_mean_t_opt, const c10::optional& running_var_t_opt, - bool training, double exponential_average_factor, double epsilon) -{ + const Tensor& input_t, + const Tensor& weight_t, + const c10::optional& bias_t_opt, + const c10::optional& running_mean_t_opt, + const c10::optional& running_var_t_opt, + bool training, + double exponential_average_factor, + double epsilon) { // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); + c10::MaybeOwned bias_t_maybe_owned = + at::borrow_from_optional_tensor(bias_t_opt); const Tensor& bias_t = *bias_t_maybe_owned; - const Tensor& running_mean_t = c10::value_or_else(running_mean_t_opt, [] {return Tensor();}); - const Tensor& running_var_t = c10::value_or_else(running_var_t_opt, [] {return Tensor();}); - - TensorArg input{ input_t, "input", 1 }, - weight{ weight_t, "weight", 2 }, - bias{ bias_t, "bias", 3 }, - running_mean{ running_mean_t, "running_mean", 4 }, - running_var{ running_var_t, "running_var", 5 }; + const Tensor& running_mean_t = + c10::value_or_else(running_mean_t_opt, [] { return Tensor(); }); + const Tensor& running_var_t = + c10::value_or_else(running_var_t_opt, [] { return Tensor(); }); + + TensorArg input{input_t, "input", 1}, weight{weight_t, "weight", 2}, + bias{bias_t, "bias", 3}, running_mean{running_mean_t, "running_mean", 4}, + running_var{running_var_t, "running_var", 5}; CheckedFrom c = "cudnn_batch_norm"; checkAllDefined(c, {input, weight, bias}); @@ -122,19 +166,19 @@ std::tuple cudnn_batch_norm( } cudnnBatchNormMode_t mode = getCudnnBatchNormMode( - training, - input->suggest_memory_format(), - input->dim() - ); + training, input->suggest_memory_format(), input->dim()); - auto output_t = at::empty_like(*input, input->options(), input->suggest_memory_format()); + auto output_t = + at::empty_like(*input, input->options(), input->suggest_memory_format()); - TensorArg output{ output_t, "output", 0 }; + TensorArg output{output_t, "output", 0}; auto handle = getCudnnHandle(); auto dataType = getCudnnDataType(*input); - TensorDescriptor idesc{ *input, 4 }; // input descriptor - TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 }; // descriptor for weight, bias, running_mean, etc. + TensorDescriptor idesc{*input, 4}; // input descriptor + TensorDescriptor wdesc{ + expandScale(*weight, input->dim()), + 4}; // descriptor for weight, bias, running_mean, etc. 
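(Editorial aside, not part of the patch hunks: the new `_get_cudnn_batch_norm_reserve_space_size` helper, declared with `TORCH_API` in the `BatchNorm.h` file this patch adds, factors the `cudnnGetBatchNormalizationTrainingExReserveSpaceSize` query out of the forward path so that callers can size the byte-typed reserve buffer the same way the kernel does below. A minimal sketch of one plausible usage, assuming a cuDNN-enabled build and a CUDA input tensor:)

    // Illustrative sketch only; assumes a cuDNN-enabled build. The helper is
    // declared in the BatchNorm.h header added by this patch.
    #include <ATen/ATen.h>
    #include <ATen/native/cudnn/BatchNorm.h>

    at::Tensor make_batch_norm_reserve(const at::Tensor& input, bool training) {
      // Query the reserve size the same way cudnn_batch_norm does internally,
      // then allocate it as a byte tensor on the input's device.
      const size_t reserve_size =
          at::native::_get_cudnn_batch_norm_reserve_space_size(input, training);
      return at::empty({static_cast<int64_t>(reserve_size)},
                       input.options().dtype(at::kByte));
    }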
Constant one(dataType, 1); Constant zero(dataType, 0); @@ -143,10 +187,9 @@ std::tuple cudnn_batch_norm( Tensor reserve; if (training) { - int64_t num_features = input_t.size(1); - save_mean = at::empty({ num_features }, weight_t.options()); - save_var = at::empty({ num_features }, weight_t.options()); + save_mean = at::empty({num_features}, weight_t.options()); + save_var = at::empty({num_features}, weight_t.options()); auto op = CUDNN_BATCHNORM_OPS_BN; size_t workspace_size; @@ -163,14 +206,8 @@ std::tuple cudnn_batch_norm( Tensor workspace = at::empty(workspace_size, input->options().dtype(kByte)); // get the reserved size and allocate as tensor - size_t reserve_size; - AT_CUDNN_CHECK(cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - handle, - mode, - op, - nullptr, - idesc.desc(), - &reserve_size)); + size_t reserve_size = + _get_cudnn_batch_norm_reserve_space_size(input_t, true /* training */); reserve = at::empty(reserve_size, input->options().dtype(kByte)); AT_CUDNN_CHECK(cudnnBatchNormalizationForwardTrainingEx( @@ -180,14 +217,14 @@ std::tuple cudnn_batch_norm( &one, &zero, idesc.desc(), - input->data_ptr(), - nullptr, // z descriptor for BN-Add-Relu - nullptr, // z for BN-Add-ReLU + input->const_data_ptr(), + nullptr, // z descriptor for BN-Add-Relu + nullptr, // z for BN-Add-ReLU idesc.desc(), output->data_ptr(), wdesc.desc(), - weight->data_ptr(), - bias->data_ptr(), + weight->const_data_ptr(), + bias->const_data_ptr(), exponential_average_factor, at::maybe_data_ptr(running_mean), at::maybe_data_ptr(running_var), @@ -205,21 +242,27 @@ std::tuple cudnn_batch_norm( save_mean = at::empty({0}, weight_t.options()); save_var = at::empty({0}, weight_t.options()); AT_CUDNN_CHECK(cudnnBatchNormalizationForwardInference( - handle, mode, &one, &zero, - idesc.desc(), input->data_ptr(), - idesc.desc(), output->data_ptr(), - wdesc.desc(), - weight->data_ptr(), - bias->data_ptr(), - running_mean->data_ptr(), - running_var->data_ptr(), - epsilon)); + handle, + mode, + &one, + &zero, + idesc.desc(), + input->const_data_ptr(), + idesc.desc(), + output->data_ptr(), + wdesc.desc(), + weight->const_data_ptr(), + bias->const_data_ptr(), + running_mean->const_data_ptr(), + running_var->const_data_ptr(), + epsilon)); } // save_mean and save_var can be undefined // If this causes problems, we can initialize them to empty tensors // of the correct type - return std::tuple{output_t, save_mean, save_var, reserve}; + return std::tuple{ + output_t, save_mean, save_var, reserve}; } // NB: CuDNN only implements the backward algorithm for batchnorm @@ -246,13 +289,13 @@ std::tuple cudnn_batch_norm_backward( // TODO: Is it worth it to have a contiguous call or maybe we should go with // whatever format is given here. 
- auto grad_output_contig = grad_output_t.contiguous(input_t.suggest_memory_format()); - TensorArg input{ input_t, "input", 1 }, - grad_output{ grad_output_contig, "grad_output", 2 }, - weight{ weight_t, "weight", 3 }, - save_mean{ save_mean_t, "save_mean", 4 }, - save_var{ save_var_t, "save_var", 5 }, - reserve{ reserveSpace, "reserve_space", 6 }; + auto grad_output_contig = + grad_output_t.contiguous(input_t.suggest_memory_format()); + TensorArg input{input_t, "input", 1}, + grad_output{grad_output_contig, "grad_output", 2}, + weight{weight_t, "weight", 3}, save_mean{save_mean_t, "save_mean", 4}, + save_var{save_var_t, "save_var", 5}, + reserve{reserveSpace, "reserve_space", 6}; CheckedFrom c = "cudnn_batch_norm_backward"; checkAllDefined(c, {input, grad_output, weight, save_mean, save_var}); @@ -277,21 +320,23 @@ std::tuple cudnn_batch_norm_backward( } cudnnBatchNormMode_t mode = getCudnnBatchNormMode( - true, // training - input->suggest_memory_format(), - input->dim() - ); + true, // training + input->suggest_memory_format(), + input->dim()); - auto grad_input_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format()); + auto grad_input_t = at::empty( + input->sizes(), input->options(), input->suggest_memory_format()); auto grad_weight_t = at::empty(weight->sizes(), weight->options()); - auto grad_bias_t = at::empty(weight->sizes(), weight->options()); + auto grad_bias_t = at::empty(weight->sizes(), weight->options()); auto handle = getCudnnHandle(); auto dataType = getCudnnDataType(*input); - TensorDescriptor idesc{ *input, 4 }; // input, grad_output descriptor - TensorDescriptor odesc{ *grad_output, 4 }; // input, grad_output descriptor - TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 }; // descriptor for weight, save_mean, etc. + TensorDescriptor idesc{*input, 4}; // input, grad_output descriptor + TensorDescriptor odesc{*grad_output, 4}; // input, grad_output descriptor + TensorDescriptor wdesc{ + expandScale(*weight, input->dim()), + 4}; // descriptor for weight, save_mean, etc. 
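(Editorial aside: the recurring `data_ptr()` to `const_data_ptr()` changes in these hunks follow one convention: tensors that are only read are accessed through `const_data_ptr()`, and `const_cast` appears only at the boundary of C APIs whose signatures lack const qualification, as with `getrsBatched` earlier in this diff. A minimal sketch of the convention, with a hypothetical legacy kernel standing in for the cuBLAS/cuDNN entry points:)

    #include <ATen/ATen.h>
    #include <cstdint>

    // Hypothetical C-style API that takes a mutable pointer even though it only
    // reads from it, as several cuBLAS/cuSOLVER entry points do.
    static void legacy_read_only_kernel(float* data, int64_t n) {
      double acc = 0.0;
      for (int64_t i = 0; i < n; ++i) {
        acc += data[i];
      }
      (void)acc;
    }

    void call_legacy(const at::Tensor& t) {
      // Read-only access on the C++ side stays const-correct...
      const float* p = t.const_data_ptr<float>();
      // ...and const is cast away only at the FFI boundary, mirroring the patch.
      legacy_read_only_kernel(const_cast<float*>(p), t.numel());
    }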
Constant one(dataType, 1); Constant zero(dataType, 0); @@ -314,28 +359,42 @@ std::tuple cudnn_batch_norm_backward( Tensor workspace = at::empty(workspace_size, input->options().dtype(kByte)); AT_CUDNN_CHECK(cudnnBatchNormalizationBackwardEx( - handle, mode, op, &one, &zero, &one, &zero, - idesc.desc(), input->data_ptr(), - nullptr, nullptr, - odesc.desc(), grad_output->data_ptr(), - nullptr, nullptr, - idesc.desc(), grad_input_t.data_ptr(), - wdesc.desc(), weight->data_ptr(), - nullptr, - grad_weight_t.data_ptr(), - grad_bias_t.data_ptr(), - epsilon, - save_mean->data_ptr(), - save_var->data_ptr(), - nullptr, - workspace.data_ptr(), - workspace_size, - reserve->data_ptr(), - reserve->numel())); - - return std::tuple{grad_input_t, grad_weight_t, grad_bias_t}; + handle, + mode, + op, + &one, + &zero, + &one, + &zero, + idesc.desc(), + input->const_data_ptr(), + nullptr, + nullptr, + odesc.desc(), + grad_output->const_data_ptr(), + nullptr, + nullptr, + idesc.desc(), + grad_input_t.data_ptr(), + wdesc.desc(), + weight->const_data_ptr(), + nullptr, + grad_weight_t.data_ptr(), + grad_bias_t.data_ptr(), + epsilon, + save_mean->const_data_ptr(), + save_var->const_data_ptr(), + nullptr, + workspace.data_ptr(), + workspace_size, + reserve->data_ptr(), + reserve->numel())); + + return std::tuple{ + grad_input_t, grad_weight_t, grad_bias_t}; } -}} // namespace native +} // namespace native +} // namespace at #endif diff --git a/aten/src/ATen/native/cudnn/BatchNorm.h b/aten/src/ATen/native/cudnn/BatchNorm.h new file mode 100644 index 0000000000000..3da76c0c16e41 --- /dev/null +++ b/aten/src/ATen/native/cudnn/BatchNorm.h @@ -0,0 +1,6 @@ +namespace at::native { + +TORCH_API size_t +_get_cudnn_batch_norm_reserve_space_size(const Tensor& input_t, bool training); + +} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp b/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp index f6362b828f4ca..8475a143f466c 100644 --- a/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp +++ b/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp @@ -12,7 +12,8 @@ #include #endif -namespace at { namespace native { +namespace at { +namespace native { // --------------------------------------------------------------------- // @@ -25,89 +26,180 @@ namespace at { namespace native { // See Note [ATen preprocessor philosophy] at::Tensor cudnn_convolution( - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { AT_ERROR("cudnn_convolution: ATen not compiled with cuDNN support"); } at::Tensor& cudnn_convolution_out( - const Tensor& input_t, const Tensor& weight_t, IntArrayRef padding, - IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, - bool deterministic, bool allow_tf32, Tensor& output_t) { + const Tensor& input_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32, + Tensor& output_t) { AT_ERROR("cudnn_convolution_out: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_input( - IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, 
IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); + IntArrayRef input_size, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_weight( - IntArrayRef weight_size, const at::Tensor& grad_output, const at::Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); + IntArrayRef weight_size, + const at::Tensor& grad_output, + const at::Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); } -std::tuple cudnn_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { +std::tuple cudnn_convolution_backward( + const at::Tensor& input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32, + std::array output_mask) { AT_ERROR("cudnn_convolution_backward: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose( - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { AT_ERROR("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose_backward_input( - const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose_backward_weight( - IntArrayRef weight_size, const at::Tensor& grad_output, const at::Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); + IntArrayRef weight_size, + const at::Tensor& grad_output, + const at::Tensor& input, + IntArrayRef 
padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); } -std::tuple cudnn_convolution_transpose_backward( - const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { - AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); +std::tuple cudnn_convolution_transpose_backward( + const at::Tensor& input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32, + std::array output_mask) { + AT_ERROR( + "cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); } void raw_cudnn_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("raw_cudnn_convolution_forward_out: ATen not compiled with cuDNN support"); + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "raw_cudnn_convolution_forward_out: ATen not compiled with cuDNN support"); } void raw_cudnn_convolution_backward_input_out( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("raw_cudnn_convolution_backward_input_out: ATen not compiled with cuDNN support"); + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "raw_cudnn_convolution_backward_input_out: ATen not compiled with cuDNN support"); } void raw_cudnn_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("raw_cudnn_convolution_backward_weight_out: ATen not compiled with cuDNN support"); + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "raw_cudnn_convolution_backward_weight_out: ATen not compiled with cuDNN support"); } Tensor cudnn_convolution_relu( @@ -134,6 +226,7 @@ Tensor cudnn_convolution_add_relu( AT_ERROR("cudnn_convolution_add_relu: ATen not compiled with cuDNN support"); } -#endif // AT_CUDNN_ENABLED +#endif // AT_CUDNN_ENABLED -}} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cudnn/ConvShared.cpp b/aten/src/ATen/native/cudnn/ConvShared.cpp index 3a615806e50a9..104ae8c70803d 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.cpp +++ b/aten/src/ATen/native/cudnn/ConvShared.cpp @@ -1,10 +1,10 @@ 
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include #include -#include #include #include +#include #include +#include #include #if AT_CUDNN_ENABLED() @@ -30,7 +30,7 @@ // ConvPlaceholders.cpp contains placeholder implementation of cudnn // convolution when cudnn is not enabled. These operators only raises // errors, and do no real computation. These operators are implemented -// using currnet operators. +// using current operators. // // cuDNN v7 and v8 have different API. ConvShared.{cpp, h} contains // code shared by v7 and v8. Conv_v7.cpp contains implementation of @@ -54,7 +54,7 @@ // Function that has different implementation on Conv_v7.cpp // and Conv_v8.cpp // -// The raw API directly invokes CuDNN and are implemeted differently +// The raw API directly invokes CuDNN and are implemented differently // on cuDNN v7 and cuDNN v8 // // There are a few reasons this should never be directly exposed @@ -71,7 +71,8 @@ // - Things that happen in TensorArg // - Check arguments (type, GPU, shape) -namespace at { namespace native { +namespace at { +namespace native { // --------------------------------------------------------------------- // @@ -79,16 +80,17 @@ namespace at { namespace native { // // --------------------------------------------------------------------- -std::ostream& operator<<(std::ostream & out, const ConvolutionParams& params) { +std::ostream& operator<<(std::ostream& out, const ConvolutionParams& params) { out << "ConvolutionParams \n" - << " memory_format = " << params.memory_format << "\n" - << " data_type = " << cudnnTypeToString(params.dataType) << "\n" - << " padding = " << ArrayRef{params.padding} << "\n" - << " stride = " << ArrayRef{params.stride} << "\n" - << " dilation = " << ArrayRef{params.dilation} << "\n" - << " groups = " << params.groups << "\n" - << " deterministic = " << (params.deterministic ? "true" : "false") << "\n" - << " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << "\n"; + << " memory_format = " << params.memory_format << "\n" + << " data_type = " << cudnnTypeToString(params.dataType) << "\n" + << " padding = " << ArrayRef{params.padding} << "\n" + << " stride = " << ArrayRef{params.stride} << "\n" + << " dilation = " << ArrayRef{params.dilation} << "\n" + << " groups = " << params.groups << "\n" + << " deterministic = " << (params.deterministic ? "true" : "false") + << "\n" + << " allow_tf32 = " << (params.allow_tf32 ? 
"true" : "false") << "\n"; return out; } @@ -100,10 +102,15 @@ std::ostream& operator<<(std::ostream & out, const ConvolutionParams& params) { // grad_input/grad_output, so this is not very pressing) void setConvolutionParams( ConvolutionParams* params, - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool deterministic, bool allow_tf32, at::MemoryFormat memory_format) { - + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool deterministic, + bool allow_tf32, + at::MemoryFormat memory_format) { cudnnDataType_t dataType = getCudnnDataType(input); memset(params, 0, sizeof(ConvolutionParams)); params->device_id = at::cuda::current_device(); @@ -112,8 +119,8 @@ void setConvolutionParams( params->input_dim = input.dim(); params->memory_format = memory_format; for (int i = 0; i != params->input_dim; ++i) { - params->input_size[i] = (int) input.sizes()[i]; - params->weight_size[i] = (int) weight.sizes()[i]; + params->input_size[i] = (int)input.sizes()[i]; + params->weight_size[i] = (int)weight.sizes()[i]; } // ASSERT(padding.size() == stride.size()) // ASSERT(padding.size() == dilation.size()) @@ -133,37 +140,55 @@ std::string repro_from_args(const ConvolutionParams& params) { auto pybool = [](bool b) -> const char* { return b ? "True" : "False"; }; std::string partial_dtype; switch (params.dataType) { - case CUDNN_DATA_FLOAT: partial_dtype = "float"; break; - case CUDNN_DATA_DOUBLE: partial_dtype = "double"; break; - case CUDNN_DATA_HALF: partial_dtype = "half"; break; - default: partial_dtype = "unsupported"; + case CUDNN_DATA_FLOAT: + partial_dtype = "float"; + break; + case CUDNN_DATA_DOUBLE: + partial_dtype = "double"; + break; + case CUDNN_DATA_HALF: + partial_dtype = "half"; + break; + default: + partial_dtype = "unsupported"; } const std::string full_dtype = "torch." + partial_dtype; const int out_channels = params.weight_size[0]; const int in_channels = params.weight_size[1] * params.groups; const size_t dim = params.input_dim; - const std::string channels_last_xd = dim == 4 ? "channels_last" : "channels_last_3d"; + const std::string channels_last_xd = + dim == 4 ? "channels_last" : "channels_last_3d"; const std::string to_channels_last = - ((params.memory_format == at::MemoryFormat::ChannelsLast) || (params.memory_format == at::MemoryFormat::ChannelsLast3d)) \ - ? ".to(memory_format=torch." + channels_last_xd + ")" : ""; + ((params.memory_format == at::MemoryFormat::ChannelsLast) || + (params.memory_format == at::MemoryFormat::ChannelsLast3d)) + ? ".to(memory_format=torch." + channels_last_xd + ")" + : ""; std::ostringstream ss; ss << "You can try to repro this exception using the following code snippet. 
"; ss << "If that doesn't trigger the error, please include your original repro script when reporting this issue.\n\n"; ss << "import torch\n"; - ss << "torch.backends.cuda.matmul.allow_tf32 = " << pybool(at::globalContext().allowTF32CuBLAS()) << "\n"; - ss << "torch.backends.cudnn.benchmark = " << pybool(at::globalContext().benchmarkCuDNN()) << "\n"; - ss << "torch.backends.cudnn.deterministic = " << pybool(params.deterministic) << "\n"; - ss << "torch.backends.cudnn.allow_tf32 = " << pybool(params.allow_tf32) << "\n"; - ss << "data = torch.randn(" << ArrayRef(params.input_size, dim) << ", dtype=" << full_dtype << ", "; - ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n"; - ss << "net = torch.nn.Conv" << dim-2 << "d(" << in_channels << ", " << out_channels << ", "; - ss << "kernel_size=" << ArrayRef(¶ms.weight_size[2], dim - 2) << ", "; - ss << "padding=" << ArrayRef(params.padding, dim-2) << ", "; - ss << "stride=" << ArrayRef(params.stride, dim-2) << ", "; - ss << "dilation=" << ArrayRef(params.dilation, dim-2) << ", "; - ss << "groups=" << params.groups << ")\n"; - ss << "net = net.cuda()." << partial_dtype << "()" << to_channels_last << "\n"; + ss << "torch.backends.cuda.matmul.allow_tf32 = " + << pybool(at::globalContext().allowTF32CuBLAS()) << "\n"; + ss << "torch.backends.cudnn.benchmark = " + << pybool(at::globalContext().benchmarkCuDNN()) << "\n"; + ss << "torch.backends.cudnn.deterministic = " << pybool(params.deterministic) + << "\n"; + ss << "torch.backends.cudnn.allow_tf32 = " << pybool(params.allow_tf32) + << "\n"; + ss << "data = torch.randn(" << ArrayRef(params.input_size, dim) + << ", dtype=" << full_dtype << ", "; + ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n"; + ss << "net = torch.nn.Conv" << dim - 2 << "d(" << in_channels << ", " + << out_channels << ", "; + ss << "kernel_size=" << ArrayRef(¶ms.weight_size[2], dim - 2) + << ", "; + ss << "padding=" << ArrayRef(params.padding, dim - 2) << ", "; + ss << "stride=" << ArrayRef(params.stride, dim - 2) << ", "; + ss << "dilation=" << ArrayRef(params.dilation, dim - 2) << ", "; + ss << "groups=" << params.groups << ")\n"; + ss << "net = net.cuda()." 
<< partial_dtype << "()" << to_channels_last + << "\n"; ss << "out = net(data)\n"; ss << "out.backward(torch.randn_like(out))\n"; ss << "torch.cuda.synchronize()\n\n"; @@ -339,10 +364,16 @@ Tensor cudnn_convolution_transpose_backward_input( Tensor cudnn_convolution_backward_input( CheckedFrom c, - IntArrayRef input_size, const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ + IntArrayRef input_size, + const TensorArg& grad_output, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { checkAllSameType(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight}); @@ -351,54 +382,114 @@ Tensor cudnn_convolution_backward_input( input_size, grad_output->options().memory_format(memory_format)); // Avoid "grad_input" when this is being used as transposed convolution - TensorArg grad_input{ grad_input_t, "result", 0 }; - convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + TensorArg grad_input{grad_input_t, "result", 0}; + convolution_shape_check( + c, grad_input, weight, grad_output, padding, stride, dilation, groups); Tensor weight_contig = weight->contiguous(memory_format); Tensor grad_output_contig = grad_output->contiguous(memory_format); raw_cudnn_convolution_backward_input_out( - *grad_input, grad_output_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + *grad_input, + grad_output_contig, + weight_contig, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); return *grad_input; } Tensor cudnn_convolution_transpose_forward( CheckedFrom c, - const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ - auto input_size = conv_input_size(grad_output->sizes(), weight->sizes(), - padding, output_padding, stride, dilation, groups); - return cudnn_convolution_backward_input(c, input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + const TensorArg& grad_output, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + auto input_size = conv_input_size( + grad_output->sizes(), + weight->sizes(), + padding, + output_padding, + stride, + dilation, + groups); + return cudnn_convolution_backward_input( + c, + input_size, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } Tensor cudnn_convolution_backward_input( - IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - weight{ weight_t, "weight", 2 }; + IntArrayRef input_size, + const Tensor& grad_output_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + TensorArg grad_output{grad_output_t, 
"grad_output", 1}, + weight{weight_t, "weight", 2}; return cudnn_convolution_backward_input( "cudnn_convolution_backward_input", - input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + input_size, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } Tensor cudnn_convolution_transpose( - const Tensor& input_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) -{ - TensorArg input { input_t, "input", 1 }, - weight { weight_t, "weight", 2 }; + const Tensor& input_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + TensorArg input{input_t, "input", 1}, weight{weight_t, "weight", 2}; CheckedFrom c = "cudnn_convolution_transpose"; auto output_t = cudnn_convolution_transpose_forward( - c, input, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + c, + input, + weight, + padding, + output_padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); return output_t; } @@ -410,31 +501,54 @@ Tensor cudnn_convolution_transpose( Tensor cudnn_convolution_backward_weight( CheckedFrom c, - IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { auto layout = cudnn_conv_suggest_memory_format(input_t, grad_output_t); Tensor grad_output_contig_t = grad_output_t.contiguous(layout); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; + TensorArg grad_output_contig{grad_output_contig_t, "grad_output", 1}; Tensor input_contig_t = input_t.contiguous(layout); - TensorArg input{ input_contig_t, "input", 2}; + TensorArg input{input_contig_t, "input", 2}; checkAllSameType(c, {grad_output_contig, input}); checkAllSameGPU(c, {grad_output_contig, input}); - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), layout); + auto grad_weight_t = + at::empty(weight_size, grad_output_contig->options(), layout); // For uniformity with everything else, although it seems grad_weight // would be unambiguous too. 
- TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); + TensorArg grad_weight{grad_weight_t, "result", 0}; + convolution_shape_check( + c, + input, + grad_weight, + grad_output_contig, + padding, + stride, + dilation, + groups); raw_cudnn_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + *grad_weight, + *grad_output_contig, + *input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); return grad_weight_t; } @@ -443,20 +557,39 @@ Tensor cudnn_convolution_backward_weight( IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { return cudnn_convolution_backward_weight( "cudnn_convolution_backward_weight", - weight_size, grad_output_t, input_t, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + weight_size, + grad_output_t, + input_t, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } -std::tuple cudnn_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { - +std::tuple cudnn_convolution_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32, + std::array output_mask) { Tensor grad_output = grad_output_t.to(input.suggest_memory_format()); Tensor grad_input, grad_weight; @@ -469,45 +602,104 @@ std::tuple cudnn_convolution_backward( } } else { if (output_mask[0]) { - grad_input = cudnn_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + grad_input = cudnn_convolution_backward_input( + input.sizes(), + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } if (output_mask[1]) { - grad_weight = cudnn_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + grad_weight = cudnn_convolution_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } } - return std::tuple{grad_input, grad_weight}; + return std::tuple{grad_input, grad_weight}; } Tensor cudnn_convolution_transpose_backward_weight( IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { return cudnn_convolution_backward_weight( "cudnn_convolution_backward_weight", - weight_size, input_t, grad_output_t, - padding, 
stride, dilation, groups, benchmark, deterministic, allow_tf32); + weight_size, + input_t, + grad_output_t, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } -std::tuple cudnn_convolution_transpose_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { - +std::tuple cudnn_convolution_transpose_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32, + std::array output_mask) { Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); Tensor grad_input, grad_weight; if (output_mask[0]) { - grad_input = cudnn_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + grad_input = cudnn_convolution_transpose_backward_input( + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } if (output_mask[1]) { - grad_weight = cudnn_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + grad_weight = cudnn_convolution_transpose_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } - return std::tuple{grad_input, grad_weight}; + return std::tuple{grad_input, grad_weight}; } Tensor cudnn_convolution_relu( @@ -535,31 +727,14 @@ Tensor cudnn_convolution_relu( bool benchmark = ctx.benchmarkCuDNN(); bool allow_tf32 = ctx.allowTF32CuDNN(); auto _bias = bias_t.has_value() - ? bias_t.value() - : at::zeros( - {output_t.size(1)}, - optTypeMetaToScalarType(output_t.options().dtype_opt()), - output_t.options().layout_opt(), - output_t.options().device_opt(), - output_t.options().pinned_memory_opt()); - -#ifdef AT_CUDNN_CONV_BIAS_RELU_FALLBACK - raw_cudnn_convolution_add_relu_fallback_out( - output_t, - input, - weight, - output_t, // use output_t as z to satisfy CUDNN API - 0, // alpha - _bias, - stride, - padding, - dilation, - groups, - benchmark, // benchmark - false, // deterministic - allow_tf32 // allow_tf32 - ); -#else // AT_CUDNN_CONV_BIAS_RELU_FALLBACK + ? bias_t.value() + : at::zeros( + {output_t.size(1)}, + optTypeMetaToScalarType(output_t.options().dtype_opt()), + output_t.options().layout_opt(), + output_t.options().device_opt(), + output_t.options().pinned_memory_opt()); + raw_cudnn_convolution_add_relu_out( output_t, input, @@ -573,9 +748,8 @@ Tensor cudnn_convolution_relu( groups, benchmark, // benchmark false, // deterministic - allow_tf32 // allow_tf32 + allow_tf32 // allow_tf32 ); -#endif return output_t; } @@ -613,31 +787,14 @@ Tensor cudnn_convolution_add_relu( bool benchmark = ctx.benchmarkCuDNN(); auto _alpha = alpha.has_value() ? alpha.value().to() : 1.0; auto _bias = bias_t.has_value() - ? 
bias_t.value() - : at::zeros( - {output_t.size(1)}, - optTypeMetaToScalarType(output_t.options().dtype_opt()), - output_t.options().layout_opt(), - output_t.options().device_opt(), - output_t.options().pinned_memory_opt()); - -#ifdef AT_CUDNN_CONV_BIAS_RELU_FALLBACK - raw_cudnn_convolution_add_relu_fallback_out( - output_t, - input, - weight, - z, - _alpha, - _bias, - stride, - padding, - dilation, - groups, - benchmark, - false, // deterministic - allow_tf32 // allow_tf32 - ); -#else // AT_CUDNN_CONV_BIAS_RELU_FALLBACK + ? bias_t.value() + : at::zeros( + {output_t.size(1)}, + optTypeMetaToScalarType(output_t.options().dtype_opt()), + output_t.options().layout_opt(), + output_t.options().device_opt(), + output_t.options().pinned_memory_opt()); + raw_cudnn_convolution_add_relu_out( output_t, input, @@ -651,16 +808,20 @@ Tensor cudnn_convolution_add_relu( groups, benchmark, false, // deterministic - allow_tf32 // allow_tf32 + allow_tf32 // allow_tf32 ); -#endif // AT_CUDNN_CONV_BIAS_RELU_FALLBACK return output_t; } -REGISTER_CUDA_DISPATCH(cudnn_convolution_backward_stub, &cudnn_convolution_backward); -REGISTER_CUDA_DISPATCH(cudnn_convolution_transpose_backward_stub, &cudnn_convolution_transpose_backward); +REGISTER_CUDA_DISPATCH( + cudnn_convolution_backward_stub, + &cudnn_convolution_backward); +REGISTER_CUDA_DISPATCH( + cudnn_convolution_transpose_backward_stub, + &cudnn_convolution_transpose_backward); -}} +} // namespace native +} // namespace at -#endif // AT_CUDNN_ENABLED +#endif // AT_CUDNN_ENABLED diff --git a/aten/src/ATen/native/cudnn/ConvShared.h b/aten/src/ATen/native/cudnn/ConvShared.h index 89986adadac1f..ae68bfc7d20d6 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.h +++ b/aten/src/ATen/native/cudnn/ConvShared.h @@ -1,16 +1,13 @@ #pragma once #include -#include #include #include +#include #include -#if CUDNN_VERSION < 8000 -#define AT_CUDNN_CONV_BIAS_RELU_FALLBACK -#endif - -namespace at { namespace native { +namespace at { +namespace native { // --------------------------------------------------------------------- // @@ -20,8 +17,7 @@ namespace at { namespace native { // This POD struct is used to let us easily compute hashes of the // parameters -struct ConvolutionParams -{ +struct ConvolutionParams { c10::DeviceIndex device_id; cudnnDataType_t dataType; int input_size[2 + max_dim]; @@ -38,7 +34,7 @@ struct ConvolutionParams // forward and backward, so you can reuse the benchmark entry, }; -std::ostream& operator<<(std::ostream & out, const ConvolutionParams& params); +std::ostream& operator<<(std::ostream& out, const ConvolutionParams& params); // NB: This can't be a constructor, because then ConvolutionParams // would not be a POD anymore. 
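(Editorial aside on the preserved `// NB: This can't be a constructor` note just above: `ConvolutionParams` is kept a POD so that `setConvolutionParams` can `memset` it to zero and the benchmark cache can hash and compare keys byte-wise via `ParamsHash`/`ParamsEqual` (see `BenchmarkCache` later in this diff). Zeroing first matters because padding bytes are otherwise indeterminate and would make equal keys hash differently. A minimal sketch of that idea, using a hypothetical key type in place of `ConvolutionParams`:)

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    struct KeyPod {           // hypothetical stand-in for ConvolutionParams
      int64_t input_size[4];
      int32_t groups;         // the compiler adds tail padding after this field
    };

    // Byte-wise FNV-1a hash over the whole struct, in the spirit of ParamsHash.
    inline size_t bytewise_hash(const KeyPod& k) {
      const auto* p = reinterpret_cast<const unsigned char*>(&k);
      size_t h = 0xcbf29ce484222325ull;
      for (size_t i = 0; i < sizeof(KeyPod); ++i) {
        h = (h ^ p[i]) * 0x100000001b3ull;
      }
      return h;
    }

    inline KeyPod make_key(int64_t n, int64_t c, int64_t h, int64_t w, int32_t groups) {
      KeyPod k;
      // Zero the whole struct (padding included) so equal keys hash and compare equal.
      std::memset(&k, 0, sizeof(KeyPod));
      k.input_size[0] = n; k.input_size[1] = c;
      k.input_size[2] = h; k.input_size[3] = w;
      k.groups = groups;
      return k;
    }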
@@ -47,13 +43,18 @@ std::ostream& operator<<(std::ostream & out, const ConvolutionParams& params); // grad_input/grad_output, so this is not very pressing) void setConvolutionParams( ConvolutionParams* params, - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool deterministic, bool allow_tf32, at::MemoryFormat memory_format); + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool deterministic, + bool allow_tf32, + at::MemoryFormat memory_format); std::string repro_from_args(const ConvolutionParams& args); - // --------------------------------------------------------------------- // // Raw functions @@ -61,21 +62,40 @@ std::string repro_from_args(const ConvolutionParams& args); // --------------------------------------------------------------------- void raw_cudnn_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32); + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); void raw_cudnn_convolution_backward_input_out( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32); + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); void raw_cudnn_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32); + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); void raw_cudnn_convolution_add_relu_out( const Tensor& output, @@ -107,7 +127,6 @@ void raw_cudnn_convolution_add_relu_fallback_out( bool deterministic, bool allow_tf32); - #if AT_CUDNN_ENABLED() // v7 functions are preserved here to allow for runtime switching to v7 @@ -116,21 +135,40 @@ void raw_cudnn_convolution_add_relu_fallback_out( // versions, as v7 explicitly splits large tensors as a 32-bit indexing // workaround whereas v8 expects cuDNN to handle large tensors. 
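(Editorial aside on the comment above about keeping the v7 raw functions: the 32-bit indexing workaround it refers to amounts to splitting the batch dimension so that each chunk's element count fits a 32-bit index. The helper below is a hypothetical distillation, not the code in Conv_v7.cpp, shown only to make the constraint concrete:)

    #include <ATen/ATen.h>
    #include <algorithm>
    #include <limits>
    #include <vector>

    // Split along the batch dimension so every chunk has at most INT_MAX
    // elements, i.e. remains addressable with 32-bit indexing.
    std::vector<at::Tensor> split_for_32bit_indexing(const at::Tensor& t) {
      constexpr int64_t int_max = std::numeric_limits<int>::max();
      const int64_t batch = std::max<int64_t>(t.size(0), 1);
      const int64_t per_sample = std::max<int64_t>(t.numel() / batch, 1);
      const int64_t max_batch = std::max<int64_t>(int_max / per_sample, 1);
      return t.split(max_batch, /*dim=*/0);
    }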
void raw_cudnn_convolution_forward_out_v7( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32); + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); void raw_cudnn_convolution_backward_input_out_v7( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32); + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); void raw_cudnn_convolution_backward_weight_out_v7( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32); + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); void raw_cudnn_convolution_add_relu_out_v7( const Tensor& output, @@ -147,4 +185,5 @@ void raw_cudnn_convolution_add_relu_out_v7( bool deterministic, bool allow_tf32); #endif -}} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index ef3a70a2232f4..1c5b6fb94a221 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -1,5 +1,5 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include // for the definition of AT_CUDNN_ENABLED +#include // for the definition of AT_CUDNN_ENABLED #if AT_CUDNN_ENABLED() @@ -13,12 +13,12 @@ #include #endif -#include -#include #include -#include #include #include +#include +#include +#include #include #include @@ -27,43 +27,48 @@ #include #include +#include +#include #include #include -#include -#include #include #include -#include +#include #include // Note [behavior of cudnnFind and cudnnGet] -// You'll notice that by default, in the ConvolutionDescriptor, we do the following: +// You'll notice that by default, in the ConvolutionDescriptor, we do the +// following: // -// AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH)); -// if(dataType == CUDNN_DATA_HALF) -// AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH)); +// AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), +// CUDNN_DEFAULT_MATH)); if(dataType == CUDNN_DATA_HALF) +// AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), +// CUDNN_TENSOR_OP_MATH)); // // Update: AT_CUDNN_CHECK is updated with AT_CUDNN_CHECK_WITH_SHAPES, which -// automatically prints tensor shapes and convolution parameters if there is -// a cuDNN exception thrown. +// automatically prints tensor shapes and convolution parameters if there +// is a cuDNN exception thrown. // -// When cudnnSetConvolutionMathType is called before cudnnGet/cudnnFind, it informs -// cudnnGet/cudnnFind to iterate/take into account both tensor core and non-tensor-core algos. -// If you don't call cudnnSetConvolutionMathType before calling cudnnGet/cudnnFind, -// cudnnGet/cudnnFind may not pick tensor core algos. 
+// When cudnnSetConvolutionMathType is called before cudnnGet/cudnnFind, it +// informs cudnnGet/cudnnFind to iterate/take into account both tensor core and +// non-tensor-core algos. If you don't call cudnnSetConvolutionMathType before +// calling cudnnGet/cudnnFind, cudnnGet/cudnnFind may not pick tensor core +// algos. // -// Now after its run, cudnnGet/cudnnFind comes up with the best pair of algo+mathType -// with all the initial knowledge its given. It then becomes the user's responsibility -// to update mathType of the convolution descriptor and call the subsequent cudnn calls with -// the best algo and the updated descriptor. If we don't update the descriptor but just run -// with the best algo, under the hood, cudnn will run with the slower kernel -// since it sees fastest algorithm combination with a sub optimal mathType. - -constexpr size_t operator "" _TiB(unsigned long long n) { +// Now after its run, cudnnGet/cudnnFind comes up with the best pair of +// algo+mathType with all the initial knowledge its given. It then becomes the +// user's responsibility to update mathType of the convolution descriptor and +// call the subsequent cudnn calls with the best algo and the updated +// descriptor. If we don't update the descriptor but just run with the best +// algo, under the hood, cudnn will run with the slower kernel since it sees +// fastest algorithm combination with a sub optimal mathType. + +constexpr size_t operator"" _TiB(unsigned long long n) { return size_t(n) * 1024 * 1024 * 1024 * 1024; } -namespace at { namespace native { +namespace at { +namespace native { // Convenience struct for passing around descriptors and data // pointers @@ -72,23 +77,27 @@ struct ConvolutionArgs { ConvolutionParams params; TensorDescriptor idesc, odesc; FilterDescriptor wdesc; - const Tensor& input, output, weight; + const Tensor &input, output, weight; ConvolutionDescriptor cdesc; - ConvolutionArgs(const Tensor& input, const Tensor& output, const Tensor& weight) : input(input), output(output), weight(weight) { - } + ConvolutionArgs( + const Tensor& input, + const Tensor& output, + const Tensor& weight) + : input(input), output(output), weight(weight) {} }; -std::ostream& operator<<(std::ostream & out, const ConvolutionArgs& args) { - out << repro_from_args(args.params) // already has a trailing newline - << args.params // already has a trailing newline - << "input: " << args.idesc // already has a trailing newline - << "output: " << args.odesc // already has a trailing newline - << "weight: " << args.wdesc // already has a trailing newline - << "Pointer addresses: " << "\n" - << " input: " << args.input.data_ptr() << "\n" - << " output: " << args.output.data_ptr() << "\n" - << " weight: " << args.weight.data_ptr() << "\n"; +std::ostream& operator<<(std::ostream& out, const ConvolutionArgs& args) { + out << repro_from_args(args.params) // already has a trailing newline + << args.params // already has a trailing newline + << "input: " << args.idesc // already has a trailing newline + << "output: " << args.odesc // already has a trailing newline + << "weight: " << args.wdesc // already has a trailing newline + << "Pointer addresses: " + << "\n" + << " input: " << args.input.const_data_ptr() << "\n" + << " output: " << args.output.const_data_ptr() << "\n" + << " weight: " << args.weight.const_data_ptr() << "\n"; return out; } @@ -103,7 +112,12 @@ std::ostream& operator<<(std::ostream & out, const ConvolutionArgs& args) { template struct BenchmarkCache { std::mutex mutex; - std::unordered_map, 
ParamsEqual> map; + std::unordered_map< + ConvolutionParams, + T, + ParamsHash, + ParamsEqual> + map; bool find(const ConvolutionParams& params, T* results) { std::lock_guard guard(mutex); @@ -129,10 +143,11 @@ BenchmarkCache bwd_filter_algos; // tensor instead. struct Workspace { Workspace(size_t size) : size(size), data(NULL) { - // Sometimes cuDNN returns a workspace size > 2^63, this could makes the allocation of - // workspace fail with some 64bit indexing error instead of an OOM error. In such case, - // we manually fail with OOM. - TORCH_CHECK_WITH(OutOfMemoryError, size < 1_TiB, "Not enough memory for workspace!"); + // Sometimes cuDNN returns a workspace size > 2^63, this could makes the + // allocation of workspace fail with some 64bit indexing error instead of an + // OOM error. In such case, we manually fail with OOM. + TORCH_CHECK_WITH( + OutOfMemoryError, size < 1_TiB, "Not enough memory for workspace!"); data = c10::cuda::CUDACachingAllocator::raw_alloc(size); } Workspace(const Workspace&) = delete; @@ -148,78 +163,80 @@ struct Workspace { void* data; }; -template -struct algorithm_search { -}; +template +struct algorithm_search {}; cudnnStatus_t getWorkspaceSize( const ConvolutionArgs& args, - cudnnConvolutionFwdAlgo_t algo, size_t* sz) -{ - return cudnnGetConvolutionForwardWorkspaceSize( - args.handle, - args.idesc.desc(), - args.wdesc.desc(), - args.cdesc.desc(), - args.odesc.desc(), - algo, - sz - ); + cudnnConvolutionFwdAlgo_t algo, + size_t* sz) { + return cudnnGetConvolutionForwardWorkspaceSize( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + algo, + sz); } cudnnStatus_t getWorkspaceSize( const ConvolutionArgs& args, - cudnnConvolutionBwdDataAlgo_t algo, size_t* sz) -{ - return cudnnGetConvolutionBackwardDataWorkspaceSize( - args.handle, - args.wdesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.idesc.desc(), - algo, - sz); + cudnnConvolutionBwdDataAlgo_t algo, + size_t* sz) { + return cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + algo, + sz); } cudnnStatus_t getWorkspaceSize( const ConvolutionArgs& args, - cudnnConvolutionBwdFilterAlgo_t algo, size_t* sz) -{ - return cudnnGetConvolutionBackwardFilterWorkspaceSize( - args.handle, - args.idesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.wdesc.desc(), - algo, - sz); + cudnnConvolutionBwdFilterAlgo_t algo, + size_t* sz) { + return cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + algo, + sz); } -template +template size_t getMaxWorkspaceSize( const ConvolutionArgs& args, - const algo_t *algo, int n_algo) -{ + const algo_t* algo, + int n_algo) { size_t max_ws_size = 0; size_t max_block_size = 0; const auto device = c10::cuda::current_device(); // For the native allocator, retrieves the size of the largest unused block. - // For cudaMallocAsync, see c10/cuda/CUDAMallocAsync.cpp:cacheInfo for details. + // For cudaMallocAsync, see c10/cuda/CUDAMallocAsync.cpp:cacheInfo for + // details. 
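The 1_TiB guard in the Workspace constructor above leans on the user-defined literal declared earlier in this file; a self-contained restatement of how it composes (the static_assert and comments are illustrative, not part of the source):

#include <cstddef>
constexpr size_t operator"" _TiB(unsigned long long n) {
  return size_t(n) * 1024 * 1024 * 1024 * 1024; // n * 2^40 bytes
}
static_assert(1_TiB == (size_t{1} << 40), "1 TiB is 2^40 bytes");
// Workspace sizes reported by cuDNN at or above this bound are treated as bogus,
//   TORCH_CHECK_WITH(OutOfMemoryError, size < 1_TiB, "Not enough memory for workspace!");
// so the failure surfaces as an ordinary OOM instead of a 64-bit indexing error.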
c10::cuda::CUDACachingAllocator::cacheInfo(device, &max_block_size); for (const auto i : c10::irange(n_algo)) { cudnnStatus_t err; size_t sz; err = getWorkspaceSize(args, algo[i], &sz); - if (CUDNN_STATUS_SUCCESS != err || sz == 0 || sz < max_ws_size || sz > max_block_size) + if (CUDNN_STATUS_SUCCESS != err || sz == 0 || sz < max_ws_size || + sz > max_block_size) continue; max_ws_size = sz; } return max_ws_size; } -template -std::vector getValidAlgorithms(perf_t *perfResults, const ConvolutionArgs& args, int n_algo) { - +template +std::vector getValidAlgorithms( + perf_t* perfResults, + const ConvolutionArgs& args, + int n_algo) { std::vector result; result.reserve(n_algo); for (const auto i : c10::irange(n_algo)) { @@ -228,170 +245,203 @@ std::vector getValidAlgorithms(perf_t *perfResults, const ConvolutionArg // TODO: Shouldn't all returned results be successful? // Double check documentation for cudnnFindConvolutionForwardAlgorithmEx if (perf.status == CUDNN_STATUS_SUCCESS) { - if (!args.params.deterministic || perf.determinism == CUDNN_DETERMINISTIC) { - + if (!args.params.deterministic || + perf.determinism == CUDNN_DETERMINISTIC) { result.push_back(perf); } } } - TORCH_CHECK(result.size() > 0, "no valid convolution algorithms available in CuDNN"); + TORCH_CHECK( + result.size() > 0, "no valid convolution algorithms available in CuDNN"); return result; } -template<> +template <> struct algorithm_search { using perf_t = cudnnConvolutionFwdAlgoPerf_t; using algo_t = cudnnConvolutionFwdAlgo_t; - static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - static BenchmarkCache& cache() { return fwd_algos; } + static constexpr auto DEFAULT_ALGO = + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + static BenchmarkCache& cache() { + return fwd_algos; + } - static std::vector findAlgorithms(const ConvolutionArgs& args, bool benchmark) { + static std::vector findAlgorithms( + const ConvolutionArgs& args, + bool benchmark) { static const algo_t algos[] = { - CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, - CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_FFT, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, }; static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; - static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, - "Missing cuDNN convolution forward algorithms"); + static_assert( + sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution forward algorithms"); int perf_count; std::unique_ptr perf_results(new perf_t[num_algos]); if (!benchmark) { - AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionForwardAlgorithm_v7( - args.handle, - args.idesc.desc(), - args.wdesc.desc(), - args.cdesc.desc(), - args.odesc.desc(), - num_algos, - &perf_count, - perf_results.get()), args); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnGetConvolutionForwardAlgorithm_v7( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + num_algos, + &perf_count, + perf_results.get()), + args); } else { size_t max_ws_size = 
getMaxWorkspaceSize(args, algos, num_algos); Workspace ws(max_ws_size); at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind"); - AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionForwardAlgorithmEx( - args.handle, - args.idesc.desc(), args.input.data_ptr(), - args.wdesc.desc(), args.weight.data_ptr(), - args.cdesc.desc(), - args.odesc.desc(), args.output.data_ptr(), - num_algos, - &perf_count, - perf_results.get(), - ws.data, - ws.size), args); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnFindConvolutionForwardAlgorithmEx( + args.handle, + args.idesc.desc(), + args.input.const_data_ptr(), + args.wdesc.desc(), + args.weight.const_data_ptr(), + args.cdesc.desc(), + args.odesc.desc(), + args.output.data_ptr(), + num_algos, + &perf_count, + perf_results.get(), + ws.data, + ws.size), + args); // Free the cached blocks in our caching allocator. They are - // needed here because the above benchmarking uses a huge amount of memory, - // e.g. a few GBs. + // needed here because the above benchmarking uses a huge amount of + // memory, e.g. a few GBs. c10::cuda::CUDACachingAllocator::emptyCache(); } return getValidAlgorithms(perf_results.get(), args, perf_count); } static void getWorkspaceSize( - const ConvolutionArgs& args, - algo_t algo, size_t* workspaceSize) - { - AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionForwardWorkspaceSize( - args.handle, - args.idesc.desc(), - args.wdesc.desc(), - args.cdesc.desc(), - args.odesc.desc(), - algo, - workspaceSize), args); + const ConvolutionArgs& args, + algo_t algo, + size_t* workspaceSize) { + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnGetConvolutionForwardWorkspaceSize( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + algo, + workspaceSize), + args); } }; -template<> +template <> struct algorithm_search { using perf_t = cudnnConvolutionBwdDataAlgoPerf_t; using algo_t = cudnnConvolutionBwdDataAlgo_t; static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; - static BenchmarkCache& cache() { return bwd_data_algos; } + static BenchmarkCache& cache() { + return bwd_data_algos; + } - static std::vector findAlgorithms(const ConvolutionArgs& args, bool benchmark) { + static std::vector findAlgorithms( + const ConvolutionArgs& args, + bool benchmark) { static const algo_t algos[] = { CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED - }; + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED}; static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; - static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, - "Missing cuDNN convolution backward data algorithms."); + static_assert( + sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution backward data algorithms."); int perf_count; std::unique_ptr perf_results(new perf_t[num_algos]); if (!benchmark) { - AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardDataAlgorithm_v7( - args.handle, - args.wdesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.idesc.desc(), - num_algos, - &perf_count, - perf_results.get()), args); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnGetConvolutionBackwardDataAlgorithm_v7( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + num_algos, + &perf_count, + perf_results.get()), + args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); 
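Both the heuristic (cudnnGet*_v7) and benchmarking (cudnnFind*Ex) paths above are steered by the same three caller-supplied flags. How those flags are typically set at the ATen level, as a usage sketch (assuming the standard at::Context setters; the Python-side equivalents are torch.backends.cudnn.benchmark, .deterministic and .allow_tf32):

#include <ATen/Context.h>
void configure_cudnn_algo_search() {
  at::globalContext().setBenchmarkCuDNN(true);      // benchmark: time real kernels via cudnnFind*AlgorithmEx
  at::globalContext().setDeterministicCuDNN(false); // deterministic would filter to deterministic algos only
  at::globalContext().setAllowTF32CuDNN(true);      // allow_tf32 gates TF32 math for float32 convolutions
}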
Workspace ws(max_ws_size); at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind"); - AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionBackwardDataAlgorithmEx( - args.handle, - args.wdesc.desc(), args.weight.data_ptr(), - args.odesc.desc(), args.output.data_ptr(), - args.cdesc.desc(), - args.idesc.desc(), args.input.data_ptr(), - num_algos, - &perf_count, - perf_results.get(), - ws.data, - ws.size), args); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnFindConvolutionBackwardDataAlgorithmEx( + args.handle, + args.wdesc.desc(), + args.weight.const_data_ptr(), + args.odesc.desc(), + args.output.const_data_ptr(), + args.cdesc.desc(), + args.idesc.desc(), + args.input.data_ptr(), + num_algos, + &perf_count, + perf_results.get(), + ws.data, + ws.size), + args); // Free the cached blocks in our caching allocator. They are - // needed here because the above benchmarking uses a huge amount of memory, - // e.g. a few GBs. + // needed here because the above benchmarking uses a huge amount of + // memory, e.g. a few GBs. c10::cuda::CUDACachingAllocator::emptyCache(); } return getValidAlgorithms(perf_results.get(), args, perf_count); } static void getWorkspaceSize( - const ConvolutionArgs& args, - cudnnConvolutionBwdDataAlgo_t algo, size_t* workspaceSize) - { - AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardDataWorkspaceSize( - args.handle, - args.wdesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.idesc.desc(), - algo, - workspaceSize), args); + const ConvolutionArgs& args, + cudnnConvolutionBwdDataAlgo_t algo, + size_t* workspaceSize) { + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + algo, + workspaceSize), + args); } }; -template<> +template <> struct algorithm_search { using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; using algo_t = cudnnConvolutionBwdFilterAlgo_t; static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; - static BenchmarkCache& cache() { return bwd_filter_algos; } + static BenchmarkCache& cache() { + return bwd_filter_algos; + } - static std::vector findAlgorithms(const ConvolutionArgs& args, bool benchmark) { + static std::vector findAlgorithms( + const ConvolutionArgs& args, + bool benchmark) { static const algo_t algos[] = { CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, @@ -401,68 +451,82 @@ struct algorithm_search { CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING, }; // NOTE: - 1 because ALGO_WINOGRAD is not implemented - static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1; - static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, - "Missing cuDNN convolution backward filter algorithms."); + static constexpr int num_algos = + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1; + static_assert( + sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution backward filter algorithms."); std::unique_ptr perf_results(new perf_t[num_algos]); int perf_count; if (!benchmark) { - AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardFilterAlgorithm_v7( - args.handle, - args.idesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.wdesc.desc(), - num_algos, - &perf_count, - perf_results.get()), args); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnGetConvolutionBackwardFilterAlgorithm_v7( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + num_algos, + &perf_count, + perf_results.get()), + args); } else { size_t max_ws_size = 
getMaxWorkspaceSize(args, algos, num_algos); Workspace ws(max_ws_size); at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind"); - AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionBackwardFilterAlgorithmEx( - args.handle, - args.idesc.desc(), args.input.data_ptr(), - args.odesc.desc(), args.output.data_ptr(), - args.cdesc.desc(), - args.wdesc.desc(), args.weight.data_ptr(), - num_algos, - &perf_count, - perf_results.get(), - ws.data, - ws.size), args); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnFindConvolutionBackwardFilterAlgorithmEx( + args.handle, + args.idesc.desc(), + args.input.const_data_ptr(), + args.odesc.desc(), + args.output.const_data_ptr(), + args.cdesc.desc(), + args.wdesc.desc(), + args.weight.data_ptr(), + num_algos, + &perf_count, + perf_results.get(), + ws.data, + ws.size), + args); // Free the cached blocks in our caching allocator. They are - // needed here because the above benchmarking uses a huge amount of memory, - // e.g. a few GBs. + // needed here because the above benchmarking uses a huge amount of + // memory, e.g. a few GBs. c10::cuda::CUDACachingAllocator::emptyCache(); } return getValidAlgorithms(perf_results.get(), args, perf_count); } - static void getWorkspaceSize(const ConvolutionArgs& args, algo_t algo, size_t* workspaceSize) - { - AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardFilterWorkspaceSize( - args.handle, - args.idesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.wdesc.desc(), - algo, - workspaceSize), args); + static void getWorkspaceSize( + const ConvolutionArgs& args, + algo_t algo, + size_t* workspaceSize) { + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + algo, + workspaceSize), + args); } }; -template +template class AlgoIterator { using search = algorithm_search; - const ConvolutionArgs &args; + const ConvolutionArgs& args; bool benchmark; -public: - AlgoIterator(const ConvolutionArgs &args, bool benchmark): args(args), benchmark(benchmark) {} + public: + AlgoIterator(const ConvolutionArgs& args, bool benchmark) + : args(args), benchmark(benchmark) {} - static std::vector onlyDefaultAlgorithm(const ConvolutionArgs &args) { + static std::vector onlyDefaultAlgorithm(const ConvolutionArgs& args) { std::vector perfResults(1); perfResults[0].algo = search::DEFAULT_ALGO; if (args.params.dataType == CUDNN_DATA_HALF) { @@ -473,11 +537,12 @@ class AlgoIterator { perfResults[0].mathType = CUDNN_FMA_MATH; } } - search::getWorkspaceSize(args, perfResults[0].algo, &(perfResults[0].memory)); + search::getWorkspaceSize( + args, perfResults[0].algo, &(perfResults[0].memory)); return perfResults; } - void try_all(std::function f) { + void try_all(std::function f) { bool only_use_default = args.params.deterministic && !benchmark; auto& cache = search::cache(); @@ -486,32 +551,36 @@ class AlgoIterator { try { f(algoPerf); return; - } catch (c10::OutOfMemoryError &e) { + } catch (c10::OutOfMemoryError& e) { cudaGetLastError(); // clear CUDA error } } - auto perfResults = only_use_default ? onlyDefaultAlgorithm(args) : search::findAlgorithms(args, benchmark); - for (auto &algoPerf : perfResults) { + auto perfResults = only_use_default + ? 
onlyDefaultAlgorithm(args) + : search::findAlgorithms(args, benchmark); + for (auto& algoPerf : perfResults) { try { f(algoPerf); cache.insert(args.params, algoPerf); return; - } catch (c10::OutOfMemoryError &e) { + } catch (c10::OutOfMemoryError& e) { cudaGetLastError(); // clear CUDA error - } catch (c10::CuDNNError &e) { + } catch (c10::CuDNNError& e) { cudaGetLastError(); // clear CUDA error } } - TORCH_CHECK(false, "Unable to find a valid cuDNN algorithm to run convolution"); + TORCH_CHECK( + false, "Unable to find a valid cuDNN algorithm to run convolution"); } }; -inline Tensor allocate_workspace(size_t size, const Tensor &other) { - // Sometimes cuDNN returns a workspace size > 2^63, this could makes the allocation of - // workspace fail with some 64bit indexing error instead of an OOM error. In such case, - // we manually fail with OOM. - TORCH_CHECK_WITH(OutOfMemoryError, size < 1_TiB, "Not enough memory for workspace!"); +inline Tensor allocate_workspace(size_t size, const Tensor& other) { + // Sometimes cuDNN returns a workspace size > 2^63, this could makes the + // allocation of workspace fail with some 64bit indexing error instead of an + // OOM error. In such case, we manually fail with OOM. + TORCH_CHECK_WITH( + OutOfMemoryError, size < 1_TiB, "Not enough memory for workspace!"); return at::empty({static_cast(size)}, other.options().dtype(kByte)); } @@ -519,14 +588,14 @@ inline Tensor allocate_workspace(size_t size, const Tensor &other) { // // - raw_cudnn_convolution_forward_out (Tensor) // Functiont that handles tensors that are too large to use 32bit indexing. -// It just split the tensor and dispatches to `raw_cudnn_convolution_forward_out_32bit`. +// It just split the tensor and dispatches to +// `raw_cudnn_convolution_forward_out_32bit`. // // - raw_cudnn_convolution_forward_out_32bit (Tensor) // Low level function which invokes CuDNN, and takes an output // tensor which is directly written to (thus _out). // - // --------------------------------------------------------------------- // // Splitting to 32bit @@ -538,19 +607,36 @@ static inline void split_batch_dim_to_32bit_out( const at::Tensor& output, const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, - int64_t max_worksize, func_t func_32bit) { + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32, + int64_t max_worksize, + func_t func_32bit) { constexpr int64_t int_max = std::numeric_limits::max(); const int64_t ni = input.numel(); const int64_t no = output.numel(); // Assume the shape of the tensor is (N, C, D1, D2, ...) // if N * C * D1 * D2 * ... <= int_max, then no need to split at all if (ni <= int_max && no <= int_max) { - func_32bit(output, input, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + func_32bit( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); return; } - // else, if C * D1 * D2 * ... <= int_max, then we just need to split across the N dimension + // else, if C * D1 * D2 * ... 
<= int_max, then we just need to split across + // the N dimension // // Here we use a simple heuristics to determine the size of each split // We don't max out the 2^31 address space because this number is super @@ -565,30 +651,42 @@ static inline void split_batch_dim_to_32bit_out( int64_t split_size_ = std::min(split_size, n - start); Tensor input_ = input.narrow(0, start, split_size_); Tensor output_ = output.narrow(0, start, split_size_); - func_32bit(output_, input_, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + func_32bit( + output_, + input_, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } return; } - // If control flow reaches here, this means even splitting N is not enough, then things starts to become complicated: - // For example, for conv2d, there following questions needs to be considered. + // If control flow reaches here, this means even splitting N is not enough, + // then things starts to become complicated: For example, for conv2d, there + // following questions needs to be considered. // - Is the memory layout NCHW or NHWC ? // - If the conv is NCHW -> NC'H'W', then should we // - split only NC? // - split only N'C'? // - split both? - // - If the conv is NHWC, then we need to split across H, we need to be very careful about the boundary condition + // - If the conv is NHWC, then we need to split across H, we need to be very + // careful about the boundary condition // to make sure that the boundary is handled correctly. - // - If we decide to make these splits, is the memory contiguous? Do we need to copy the memory? - // Considering the complexity of this issue, it is better not to use cuDNN for this case + // - If we decide to make these splits, is the memory contiguous? Do we need + // to copy the memory? 
Considering the complexity of this issue, it is better + // not to use cuDNN for this case TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN."); } - -#define ASSERT_CORRECT_PRECISION(math_type) \ -if (args.params.dataType == CUDNN_DATA_FLOAT) { \ - TORCH_INTERNAL_ASSERT(args.params.allow_tf32 || math_type == CUDNN_FMA_MATH); \ -} - +#define ASSERT_CORRECT_PRECISION(math_type) \ + if (args.params.dataType == CUDNN_DATA_FLOAT) { \ + TORCH_INTERNAL_ASSERT( \ + args.params.allow_tf32 || math_type == CUDNN_FMA_MATH); \ + } // --------------------------------------------------------------------- // @@ -597,56 +695,112 @@ if (args.params.dataType == CUDNN_DATA_FLOAT) { // --------------------------------------------------------------------- void raw_cudnn_convolution_forward_out_32bit( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { auto dataType = getCudnnDataType(input); - ConvolutionArgs args{ input, output, weight }; + ConvolutionArgs args{input, output, weight}; args.handle = getCudnnHandle(); - at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(input, weight); - setConvolutionParams(&args.params, input, weight, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format); + at::MemoryFormat memory_format = + cudnn_conv_suggest_memory_format(input, weight); + setConvolutionParams( + &args.params, + input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32, + memory_format); args.idesc.set(input, memory_format); args.wdesc.set(weight, memory_format, 0); args.odesc.set(output, memory_format); - args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, args.params.allow_tf32); + args.cdesc.set( + dataType, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + args.params.allow_tf32); // TODO: when we do legacy group convolution support, we'll repeatedly // reinitialize the workspace for each convolution we do. This is // wasteful; we'd rather reuse the workspace. OTOH, legacy group // convolution support is already pretty slow, so this might not // matter. (This applies to raw_cudnn_convolution_backward_input as well.) 
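The split_batch_dim_to_32bit_out helper defined above keeps each cuDNN call under 32-bit indexing by chunking along the batch dimension only. A condensed, self-contained sketch of that heuristic (the exact split_size computation sits in unchanged context lines not shown in this hunk; split_over_batch and run_32bit are illustrative names, and max_worksize is the per-call element budget passed by the *_v7 wrappers, e.g. 1024 * 1024 * 256 for forward):

#include <ATen/ATen.h>
#include <algorithm>
template <typename Func>
void split_over_batch(const at::Tensor& output, const at::Tensor& input,
                      int64_t max_worksize, Func run_32bit) {
  int64_t n = output.size(0);
  int64_t max_inner_size = std::max(input.numel(), output.numel()) / n; // elements per sample
  int64_t split_size = std::max<int64_t>(max_worksize / max_inner_size, 1);
  for (int64_t start = 0; start < n; start += split_size) {
    int64_t len = std::min(split_size, n - start);
    // Each chunk is small enough for the 32-bit kernel path.
    run_32bit(output.narrow(0, start, len), input.narrow(0, start, len));
  }
}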
- AlgoIterator(args, benchmark).try_all( - [&](const cudnnConvolutionFwdAlgoPerf_t &fwdAlgPerf){ - Tensor workspace = allocate_workspace(fwdAlgPerf.memory, input); - - // update convDesc mathType since cudnn 7.4+ now requires both algo + mathType to figure out - // whether to use Tensor core kernels or not - // See Note [behavior of cudnnFind and cudnnGet] - ASSERT_CORRECT_PRECISION(fwdAlgPerf.mathType); - AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), fwdAlgPerf.mathType), args); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - AT_CUDNN_CHECK_WITH_SHAPES(cudnnConvolutionForward( - args.handle, - &one, args.idesc.desc(), input.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), - args.cdesc.desc(), fwdAlgPerf.algo, workspace.data_ptr(), fwdAlgPerf.memory, - &zero, args.odesc.desc(), output.data_ptr()), - args, "Forward algorithm: ", static_cast(fwdAlgPerf.algo), "\n"); - } - ); -} + AlgoIterator(args, benchmark) + .try_all([&](const cudnnConvolutionFwdAlgoPerf_t& fwdAlgPerf) { + Tensor workspace = allocate_workspace(fwdAlgPerf.memory, input); + + // update convDesc mathType since cudnn 7.4+ now requires both algo + + // mathType to figure out whether to use Tensor core kernels or not See + // Note [behavior of cudnnFind and cudnnGet] + ASSERT_CORRECT_PRECISION(fwdAlgPerf.mathType); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnSetConvolutionMathType( + args.cdesc.mut_desc(), fwdAlgPerf.mathType), + args); + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnConvolutionForward( + args.handle, + &one, + args.idesc.desc(), + input.const_data_ptr(), + args.wdesc.desc(), + weight.const_data_ptr(), + args.cdesc.desc(), + fwdAlgPerf.algo, + workspace.data_ptr(), + fwdAlgPerf.memory, + &zero, + args.odesc.desc(), + output.data_ptr()), + args, + "Forward algorithm: ", + static_cast(fwdAlgPerf.algo), + "\n"); + }); +} void raw_cudnn_convolution_forward_out_v7( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - split_batch_dim_to_32bit_out(output, input, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, 1024 * 1024 * 256, raw_cudnn_convolution_forward_out_32bit); + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + split_batch_dim_to_32bit_out( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32, + 1024 * 1024 * 256, + raw_cudnn_convolution_forward_out_32bit); } // --------------------------------------------------------------------- @@ -659,54 +813,112 @@ void raw_cudnn_convolution_backward_input_out_32bit( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { auto dataType = getCudnnDataType(grad_output); - ConvolutionArgs args{ grad_input, grad_output, weight }; + ConvolutionArgs args{grad_input, grad_output, weight}; args.handle = getCudnnHandle(); - at::MemoryFormat memory_format = 
cudnn_conv_suggest_memory_format(grad_input, weight); - setConvolutionParams(&args.params, grad_input, weight, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format); + at::MemoryFormat memory_format = + cudnn_conv_suggest_memory_format(grad_input, weight); + setConvolutionParams( + &args.params, + grad_input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32, + memory_format); args.idesc.set(grad_input, memory_format); args.wdesc.set(weight, memory_format, 0); args.odesc.set(grad_output, memory_format); - args.cdesc.set(dataType, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, args.params.allow_tf32); - - AlgoIterator(args, benchmark).try_all( - [&](const cudnnConvolutionBwdDataAlgoPerf_t &bwdDataAlgPerf){ - Tensor workspace = allocate_workspace(bwdDataAlgPerf.memory, grad_output); - - // update convDesc mathType since cudnn 7.4+ now requires both algo + mathType to figure out - // whether to use Tensor core kernels or not - // See Note [behavior of cudnnFind and cudnnGet] - ASSERT_CORRECT_PRECISION(bwdDataAlgPerf.mathType); - AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdDataAlgPerf.mathType), args); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - AT_CUDNN_CHECK_WITH_SHAPES(cudnnConvolutionBackwardData( - args.handle, - &one, args.wdesc.desc(), weight.data_ptr(), - args.odesc.desc(), grad_output.data_ptr(), - args.cdesc.desc(), bwdDataAlgPerf.algo, workspace.data_ptr(), bwdDataAlgPerf.memory, - &zero, args.idesc.desc(), grad_input.mutable_data_ptr()), - args, - "Additional pointer addresses: \n", - " grad_output: ", grad_output.data_ptr(), "\n", - " grad_input: ", grad_input.mutable_data_ptr(), "\n", - "Backward data algorithm: ", static_cast(bwdDataAlgPerf.algo), "\n"); - } - ); + args.cdesc.set( + dataType, + grad_output.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + args.params.allow_tf32); + + AlgoIterator(args, benchmark) + .try_all([&](const cudnnConvolutionBwdDataAlgoPerf_t& bwdDataAlgPerf) { + Tensor workspace = + allocate_workspace(bwdDataAlgPerf.memory, grad_output); + + // update convDesc mathType since cudnn 7.4+ now requires both algo + + // mathType to figure out whether to use Tensor core kernels or not See + // Note [behavior of cudnnFind and cudnnGet] + ASSERT_CORRECT_PRECISION(bwdDataAlgPerf.mathType); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnSetConvolutionMathType( + args.cdesc.mut_desc(), bwdDataAlgPerf.mathType), + args); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnConvolutionBackwardData( + args.handle, + &one, + args.wdesc.desc(), + weight.const_data_ptr(), + args.odesc.desc(), + grad_output.const_data_ptr(), + args.cdesc.desc(), + bwdDataAlgPerf.algo, + workspace.data_ptr(), + bwdDataAlgPerf.memory, + &zero, + args.idesc.desc(), + grad_input.mutable_data_ptr()), + args, + "Additional pointer addresses: \n", + " grad_output: ", + grad_output.const_data_ptr(), + "\n", + " grad_input: ", + grad_input.mutable_data_ptr(), + "\n", + "Backward data algorithm: ", + static_cast(bwdDataAlgPerf.algo), + "\n"); + }); } void raw_cudnn_convolution_backward_input_out_v7( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - 
split_batch_dim_to_32bit_out(grad_input, grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, 1024 * 1024 * 128, raw_cudnn_convolution_backward_input_out_32bit); + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + split_batch_dim_to_32bit_out( + grad_input, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32, + 1024 * 1024 * 128, + raw_cudnn_convolution_backward_input_out_32bit); } // --------------------------------------------------------------------- @@ -716,98 +928,176 @@ void raw_cudnn_convolution_backward_input_out_v7( // --------------------------------------------------------------------- void raw_cudnn_convolution_backward_weight_out_32bit( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { auto dataType = getCudnnDataType(input); - ConvolutionArgs args{ input, grad_output, grad_weight }; + ConvolutionArgs args{input, grad_output, grad_weight}; args.handle = getCudnnHandle(); - at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(input, grad_weight); - setConvolutionParams(&args.params, input, grad_weight, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format); + at::MemoryFormat memory_format = + cudnn_conv_suggest_memory_format(input, grad_weight); + setConvolutionParams( + &args.params, + input, + grad_weight, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32, + memory_format); args.idesc.set(input, memory_format); args.wdesc.set(grad_weight, memory_format, 0); args.odesc.set(grad_output, memory_format); - args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, args.params.allow_tf32); - - AlgoIterator(args, benchmark).try_all( - [&](const cudnnConvolutionBwdFilterAlgoPerf_t &bwdFilterAlgPerf){ - Tensor workspace = allocate_workspace(bwdFilterAlgPerf.memory, input); - - // update convDesc mathType since cudnn 7.4+ now requires both algo + mathType to figure out - // whether to use Tensor core kernels or not - // See Note [behavior of cudnnFind and cudnnGet] - ASSERT_CORRECT_PRECISION(bwdFilterAlgPerf.mathType); - AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdFilterAlgPerf.mathType), args); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - AT_CUDNN_CHECK_WITH_SHAPES(cudnnConvolutionBackwardFilter( - args.handle, - &one, args.idesc.desc(), input.data_ptr(), - args.odesc.desc(), grad_output.data_ptr(), - args.cdesc.desc(), bwdFilterAlgPerf.algo, workspace.data_ptr(), bwdFilterAlgPerf.memory, - &zero, args.wdesc.desc(), grad_weight.data_ptr()), - args, - "Additional pointer addresses: \n", - " grad_output: ", grad_output.data_ptr(), "\n", - " grad_weight: ", grad_weight.data_ptr(), "\n", - "Backward filter algorithm: ", static_cast(bwdFilterAlgPerf.algo), "\n"); - } - ); + args.cdesc.set( + dataType, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + 
args.params.allow_tf32); + + AlgoIterator(args, benchmark) + .try_all( + [&](const cudnnConvolutionBwdFilterAlgoPerf_t& bwdFilterAlgPerf) { + Tensor workspace = + allocate_workspace(bwdFilterAlgPerf.memory, input); + + // update convDesc mathType since cudnn 7.4+ now requires both algo + // + mathType to figure out whether to use Tensor core kernels or + // not See Note [behavior of cudnnFind and cudnnGet] + ASSERT_CORRECT_PRECISION(bwdFilterAlgPerf.mathType); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnSetConvolutionMathType( + args.cdesc.mut_desc(), bwdFilterAlgPerf.mathType), + args); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnConvolutionBackwardFilter( + args.handle, + &one, + args.idesc.desc(), + input.const_data_ptr(), + args.odesc.desc(), + grad_output.const_data_ptr(), + args.cdesc.desc(), + bwdFilterAlgPerf.algo, + workspace.data_ptr(), + bwdFilterAlgPerf.memory, + &zero, + args.wdesc.desc(), + grad_weight.data_ptr()), + args, + "Additional pointer addresses: \n", + " grad_output: ", + grad_output.const_data_ptr(), + "\n", + " grad_weight: ", + grad_weight.data_ptr(), + "\n", + "Backward filter algorithm: ", + static_cast(bwdFilterAlgPerf.algo), + "\n"); + }); } void raw_cudnn_convolution_backward_weight_out_v7( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { constexpr int64_t int_max = std::numeric_limits::max(); const int64_t ni = input.numel(); const int64_t no = grad_output.numel(); // Assume the shape of the tensor is (N, C, D1, D2, ...) // if N * C * D1 * D2 * ... <= int_max, then no need to split at all if (ni <= int_max && no <= int_max) { - raw_cudnn_convolution_backward_weight_out_32bit(grad_weight, grad_output, input, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + raw_cudnn_convolution_backward_weight_out_32bit( + grad_weight, + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); return; } - // else, if C * D1 * D2 * ... <= int_max, then we just need to split across the N dimension + // else, if C * D1 * D2 * ... <= int_max, then we just need to split across + // the N dimension // // Here we use a simple heuristics to determine the size of each split // We don't max out the 2^31 address space because this number is super // large and very likely to get an OOM. int64_t n = grad_output.size(0); int64_t max_inner_size = std::max(ni, no) / n; - int64_t split_size = std::max(1024 * 1024 * 512 / max_inner_size, 1L); + int64_t split_size = + std::max(1024 * 1024 * 512 / max_inner_size, 1L); int64_t num_splits = (n + split_size - 1) / split_size; if (split_size * max_inner_size < int_max) { - const auto kAccType = (grad_weight.scalar_type() == kHalf || grad_weight.scalar_type() == kBFloat16) - ? kFloat : grad_weight.scalar_type(); - Tensor grad_weight_accumulator = at::zeros(grad_weight.sizes(), grad_weight.options().dtype(kAccType)); + const auto kAccType = (grad_weight.scalar_type() == kHalf || + grad_weight.scalar_type() == kBFloat16) + ? 
kFloat + : grad_weight.scalar_type(); + Tensor grad_weight_accumulator = + at::zeros(grad_weight.sizes(), grad_weight.options().dtype(kAccType)); for (const auto i : c10::irange(num_splits)) { int64_t start = split_size * i; int64_t split_size_ = std::min(split_size, n - start); Tensor input_ = input.narrow(0, start, split_size_); Tensor grad_output_ = grad_output.narrow(0, start, split_size_); Tensor grad_weight_ = at::empty_like(grad_weight); - raw_cudnn_convolution_backward_weight_out_32bit(grad_weight_, grad_output_, input_, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + raw_cudnn_convolution_backward_weight_out_32bit( + grad_weight_, + grad_output_, + input_, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); grad_weight_accumulator.add_(grad_weight_); } grad_weight.copy_(grad_weight_accumulator); return; } - // If control flow reaches here, this means even splitting N is not enough, then things starts to become complicated: - // For example, for conv2d, there following questions needs to be considered. + // If control flow reaches here, this means even splitting N is not enough, + // then things starts to become complicated: For example, for conv2d, there + // following questions needs to be considered. // - Is the memory layout NCHW or NHWC ? // - If the conv is NCHW -> NC'H'W', then should we // - split only NC? // - split only N'C'? // - split both? - // - If the conv is NHWC, then we need to split across H, we need to be very careful about the boundary condition + // - If the conv is NHWC, then we need to split across H, we need to be very + // careful about the boundary condition // to make sure that the boundary is handled correctly. - // - If we decide to make these splits, is the memory contiguous? Do we need to copy the memory? - // Considering the complexity of this issue, it is better not to use cuDNN for this case + // - If we decide to make these splits, is the memory contiguous? Do we need + // to copy the memory? 
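The precision trick above, accumulating per-chunk weight gradients in float whenever grad_weight is half or bfloat16 and down-casting once at the end, can be read in isolation as follows. A hedged sketch; accumulate_grad_weight_in_fp32 and compute_chunk_grad are illustrative names standing in for the surrounding loop and for raw_cudnn_convolution_backward_weight_out_32bit:

#include <ATen/ATen.h>
#include <algorithm>
template <typename Func>
void accumulate_grad_weight_in_fp32(at::Tensor& grad_weight, int64_t n,
                                    int64_t split_size, Func compute_chunk_grad) {
  auto acc_type = (grad_weight.scalar_type() == at::kHalf ||
                   grad_weight.scalar_type() == at::kBFloat16)
      ? at::kFloat
      : grad_weight.scalar_type();
  at::Tensor acc = at::zeros(grad_weight.sizes(), grad_weight.options().dtype(acc_type));
  for (int64_t start = 0; start < n; start += split_size) {
    int64_t len = std::min(split_size, n - start);
    at::Tensor chunk = at::empty_like(grad_weight);
    compute_chunk_grad(chunk, start, len); // fills chunk for samples [start, start + len)
    acc.add_(chunk);                       // accumulation happens in float
  }
  grad_weight.copy_(acc);                  // single down-cast at the end
}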
Considering the complexity of this issue, it is better + // not to use cuDNN for this case TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN."); } @@ -828,7 +1118,8 @@ void raw_cudnn_convolution_add_relu_out_v7( auto dataType = getCudnnDataType(input); ConvolutionArgs args{input, output, weight}; args.handle = getCudnnHandle(); - at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(input, weight); + at::MemoryFormat memory_format = + cudnn_conv_suggest_memory_format(input, weight); setConvolutionParams( &args.params, input, @@ -882,24 +1173,26 @@ void raw_cudnn_convolution_add_relu_out_v7( args.handle, &one, args.idesc.desc(), - input.data_ptr(), + input.const_data_ptr(), args.wdesc.desc(), - weight.data_ptr(), + weight.const_data_ptr(), args.cdesc.desc(), fwdAlgPerf.algo, workspace.data_ptr(), fwdAlgPerf.memory, &alpha_, zdesc.desc(), - z.data_ptr(), + z.const_data_ptr(), bdesc.desc(), - bias.data_ptr(), + bias.const_data_ptr(), adesc.desc(), args.odesc.desc(), output.data_ptr()), args, - "zdesc: ", zdesc, - "bdesc: ", bdesc, + "zdesc: ", + zdesc, + "bdesc: ", + bdesc, "cudnnConvolutionBiasActivationForward: ", static_cast(fwdAlgPerf.algo), "\n"); @@ -920,17 +1213,29 @@ void raw_cudnn_convolution_add_relu_fallback_out( bool benchmark, bool deterministic, bool allow_tf32) { - // cuDNN Conv-Bias-Activation: // y = act ( alpha1 * conv(x) + alpha2 * z + bias ) - // In pytorch function `raw_cudnn_convolution_add_relu_out`: alpha1 is 1, alpha 2 is `float alpha` + // In pytorch function `raw_cudnn_convolution_add_relu_out`: alpha1 is 1, + // alpha 2 is `float alpha` - raw_cudnn_convolution_forward_out(output, input, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input.dim(), bias).add(z, alpha); + raw_cudnn_convolution_forward_out( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); + at::Tensor alpha_mul_z_add_bias = + at::native::reshape_bias(input.dim(), bias).add(z, alpha); output.add_(alpha_mul_z_add_bias); output.relu_(); } -}} // namespace at::native +} // namespace native +} // namespace at #endif diff --git a/aten/src/ATen/native/cudnn/Conv_v8.cpp b/aten/src/ATen/native/cudnn/Conv_v8.cpp index aa582fc19e14c..750cbcca6b6d0 100644 --- a/aten/src/ATen/native/cudnn/Conv_v8.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v8.cpp @@ -1,6 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include // for the definition of AT_CUDNN_ENABLED +#include // for the definition of AT_CUDNN_ENABLED #if AT_CUDNN_ENABLED() @@ -12,23 +12,22 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wsuggest-override") #include C10_DIAGNOSTIC_POP() -#include -#include -#include #include +#include #include +#include #include #include #include -#include -#include +#include +#include -#include -#include #include +#include +#include -#include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -40,19 +39,20 @@ C10_DIAGNOSTIC_POP() #include #endif -namespace at { namespace native { +namespace at { +namespace native { namespace { // TODO: remove duplicate code in Conv_v7.cpp -constexpr int64_t operator "" _TiB(unsigned long long n) { +constexpr int64_t operator"" _TiB(unsigned long long n) { return size_t(n) << 40; } -uint8_t getAlignment(const Tensor &t) { +uint8_t getAlignment(const Tensor& t) { // alignment are in bytes uint8_t alignment = 1; - uintptr_t address = reinterpret_cast(t.data_ptr()); + uintptr_t address = 
reinterpret_cast(t.const_data_ptr()); for (; alignment < 32; alignment *= 2) { if (address % (alignment * 2)) { return alignment; @@ -61,17 +61,25 @@ uint8_t getAlignment(const Tensor &t) { return alignment; } -cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(const Tensor &t, const int64_t id, const uint8_t alignment, const cudnnDataType_t dataType, const at::MemoryFormat memory_format, const bool _virtual) { -#if defined(__linux__) && !defined(FBCODE_CAFFE2) && CUDNN_MAJOR == 8 && CUDNN_MINOR > 5 - // Workaround for cudnn error handling deficiency, that results in a crash on Ubuntu-22+ - // if `libnvrtc.so` is not found on the system, which strictly speaking is not necessary - // for usecases below - // See https://github.com/pytorch/pytorch/issues/97041 +cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual( + const Tensor& t, + const int64_t id, + const uint8_t alignment, + const cudnnDataType_t dataType, + const at::MemoryFormat memory_format, + const bool _virtual) { +#if defined(__linux__) && !defined(FBCODE_CAFFE2) && CUDNN_MAJOR == 8 && \ + CUDNN_MINOR > 5 + // Workaround for cudnn error handling deficiency, that results in a crash on + // Ubuntu-22+ if `libnvrtc.so` is not found on the system, which strictly + // speaking is not necessary for usecases below See + // https://github.com/pytorch/pytorch/issues/97041 static C10_UNUSED auto cudnn_cnn_infer_handler = [] { - void *handle = dlopen("libcudnn_cnn_infer.so.8", RTLD_LAZY); - char *err = dlerror(); + void* handle = dlopen("libcudnn_cnn_infer.so.8", RTLD_LAZY); + char* err = dlerror(); if (!handle) { - TORCH_WARN("Attempt to open cnn_infer failed: handle=", handle, " error: ", err); + TORCH_WARN( + "Attempt to open cnn_infer failed: handle=", handle, " error: ", err); } else if (err) { TORCH_WARN("Applied workaround for CuDNN issue, install nvrtc.so"); } @@ -81,52 +89,74 @@ cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(const Tensor &t, const auto sizes = t.sizes(); auto strides = t.strides(); bool channels_last = memory_format == at::MemoryFormat::ChannelsLast || - memory_format == at::MemoryFormat::ChannelsLast3d; - fixSizeOneDimStride(sizes.size(), &sizes[0], (int64_t *) &strides[0], channels_last); + memory_format == at::MemoryFormat::ChannelsLast3d; + fixSizeOneDimStride( + sizes.size(), &sizes[0], (int64_t*)&strides[0], channels_last); auto r = cudnn_frontend::TensorBuilder() - .setDim(sizes.size(), sizes.data()) - .setStrides(strides.size(), strides.data()) - .setId(id) - .setAlignment(alignment) - .setDataType(dataType) - .setVirtual(_virtual) - .build(); + .setDim(sizes.size(), sizes.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(dataType) + .setVirtual(_virtual) + .build(); return r; } -cudnn_frontend::Tensor getTensorDescriptor(const Tensor &t, const int64_t id, const uint8_t alignment, const at::MemoryFormat memory_format) { - return getTensorDescriptorWithTypeVirtual(t, id, alignment, getCudnnDataType(t), memory_format, false); +cudnn_frontend::Tensor getTensorDescriptor( + const Tensor& t, + const int64_t id, + const uint8_t alignment, + const at::MemoryFormat memory_format) { + return getTensorDescriptorWithTypeVirtual( + t, id, alignment, getCudnnDataType(t), memory_format, false); } -cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, const at::ScalarType scalar_type) { +cudnn_frontend::ConvDesc_v8 getConvDescriptor( + cudnnDataType_t dataType, 
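The getAlignment helper above probes how strongly a tensor's data pointer is aligned, which feeds the cudnn_frontend tensor descriptors. A functionally equivalent standalone version (the name pointer_alignment is illustrative):

#include <cstdint>
// Largest power-of-two alignment in bytes, capped at 32, that divides the address.
inline uint8_t pointer_alignment(const void* p) {
  auto address = reinterpret_cast<uintptr_t>(p);
  uint8_t alignment = 1;
  while (alignment < 32 && address % (alignment * 2) == 0) {
    alignment *= 2;
  }
  return alignment;
}
// e.g. pointer_alignment(reinterpret_cast<const void*>(0x1000)) == 32,
//      pointer_alignment(reinterpret_cast<const void*>(0x1004)) == 4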
+ IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + const at::ScalarType scalar_type) { uint64_t convDim = stride.size(); if (scalar_type == kBFloat16 || scalar_type == kHalf) { dataType = CUDNN_DATA_FLOAT; } return cudnn_frontend::ConvDescBuilder() - .setDataType(dataType) - .setMathMode(CUDNN_CROSS_CORRELATION) - .setNDims(convDim) - .setStrides(convDim, stride.data()) - .setPrePadding(convDim, padding.data()) - .setPostPadding(convDim, padding.data()) - .setDilation(convDim, dilation.data()) - .build(); + .setDataType(dataType) + .setMathMode(CUDNN_CROSS_CORRELATION) + .setNDims(convDim) + .setStrides(convDim, stride.data()) + .setPrePadding(convDim, padding.data()) + .setPostPadding(convDim, padding.data()) + .setDilation(convDim, dilation.data()) + .build(); } void filterEngineConfigs( - cudnn_frontend::EngineConfigList &from, - cudnn_frontend::EngineConfigList &to, - bool deterministic, bool allow_tf32, c10::ScalarType scalar_type) -{ + cudnn_frontend::EngineConfigList& from, + cudnn_frontend::EngineConfigList& to, + bool deterministic, + bool allow_tf32, + c10::ScalarType scalar_type) { auto filter = [=](cudnnBackendDescriptor_t c) { if (deterministic) { - if (cudnn_frontend::hasNumericalNote(c)) {return true;} + if (cudnn_frontend::hasNumericalNote< + CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC>(c)) { + return true; + } + } + if (cudnn_frontend::hasNumericalNote< + CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS>(c)) { + return true; } - if (cudnn_frontend::hasNumericalNote(c)) {return true;} if (scalar_type == kFloat) { // TODO: check under which conditions this is OK - if (!allow_tf32 && cudnn_frontend::hasNumericalNote(c)) {return true;} + if (!allow_tf32 && + cudnn_frontend::hasNumericalNote( + c)) { + return true; + } } return false; }; @@ -149,14 +179,35 @@ struct CacheKeyFused { uint8_t y_alignment; uint8_t z_alignment; uint8_t b_alignment; - // TODO: does it make sense to have this in the key? but alpha is a graph-level param... + // TODO: does it make sense to have this in the key? but alpha is a + // graph-level param... 
float alpha; }; struct CacheKeyWrapper : ParamsWrapper { - CacheKeyWrapper(const cudnnBackendDescriptorType_t operation, const Tensor& y, const Tensor& x, const Tensor& w, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, int64_t groups, bool deterministic, bool allow_tf32) { + CacheKeyWrapper( + const cudnnBackendDescriptorType_t operation, + const Tensor& y, + const Tensor& x, + const Tensor& w, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + int64_t groups, + bool deterministic, + bool allow_tf32) { at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(x, w); - setConvolutionParams(&(this->pod.params), x, w, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format); + setConvolutionParams( + &(this->pod.params), + x, + w, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32, + memory_format); this->pod.operation = operation; this->pod.x_alignment = getAlignment(x); this->pod.y_alignment = getAlignment(y); @@ -165,9 +216,31 @@ struct CacheKeyWrapper : ParamsWrapper { }; struct CacheKeyFusedWrapper : ParamsWrapper { - CacheKeyFusedWrapper(const Tensor& y, const Tensor& x, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, int64_t groups, bool deterministic, bool allow_tf32) { + CacheKeyFusedWrapper( + const Tensor& y, + const Tensor& x, + const Tensor& w, + const Tensor& z, + const Tensor& b, + const float alpha, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + int64_t groups, + bool deterministic, + bool allow_tf32) { at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(x, w); - setConvolutionParams(&(this->pod).params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format); + setConvolutionParams( + &(this->pod).params, + x, + w, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32, + memory_format); this->pod.x_alignment = getAlignment(x); this->pod.y_alignment = getAlignment(y); this->pod.w_alignment = getAlignment(w); @@ -178,177 +251,305 @@ struct CacheKeyFusedWrapper : ParamsWrapper { }; static int getLRUCacheLimit() { - constexpr int DEFAULT_LIMIT = 10000; // roughly corresponds to 2GiB assuming 200KiB per ExecutionPlan + constexpr int DEFAULT_LIMIT = + 10000; // roughly corresponds to 2GiB assuming 200KiB per ExecutionPlan // 0 is used to indicate no limit // negative values are used to indicate no caching static int limit = [&] { - const char * val = getenv("TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT"); + const char* val = getenv("TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT"); if (!val) { - return DEFAULT_LIMIT; + return DEFAULT_LIMIT; } try { return std::stoi(val); - } catch(std::invalid_argument const& e) { - TORCH_WARN("invalid TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT,", - " using default LRU cache limit of ", DEFAULT_LIMIT, " entries."); - } catch(std::out_of_range const& e) { - TORCH_WARN("invalid TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT,", - " using default LRU cache limit of ", DEFAULT_LIMIT, " entries."); + } catch (std::invalid_argument const& e) { + TORCH_WARN( + "invalid TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT,", + " using default LRU cache limit of ", + DEFAULT_LIMIT, + " entries."); + } catch (std::out_of_range const& e) { + TORCH_WARN( + "invalid TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT,", + " using default LRU cache limit of ", + DEFAULT_LIMIT, + " entries."); } return DEFAULT_LIMIT; - 
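The limit parsed above from TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT (0 meaning unbounded, negative disabling caching) feeds a textbook list-plus-map LRU in the BenchmarkCache that follows, kept thread_local because cuDNN ExecutionPlans are not guaranteed to be thread safe across all engines. The core eviction mechanism, reduced to a generic sketch with illustrative names (assumes std::hash<K> and a positive limit; the real cache supplies ParamsWrapperHash and handles the special limit values):

#include <list>
#include <unordered_map>
#include <utility>
template <typename K, typename V>
struct TinyLru {
  size_t limit = 10000;               // mirrors DEFAULT_LIMIT above
  std::list<K> order;                 // front = most recently used
  std::unordered_map<K, std::pair<V, typename std::list<K>::iterator>> map;
  V* find(const K& key) {
    auto it = map.find(key);
    if (it == map.end()) return nullptr;
    order.splice(order.begin(), order, it->second.second); // bump to front
    return &it->second.first;
  }
  void update(const K& key, V value) {
    auto it = map.find(key);
    if (it != map.end()) {
      it->second.first = std::move(value);
      order.splice(order.begin(), order, it->second.second);
      return;
    }
    if (map.size() >= limit) {        // evict least recently used
      map.erase(order.back());
      order.pop_back();
    }
    order.push_front(key);
    map.emplace(key, std::make_pair(std::move(value), order.begin()));
  }
};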
} (); + }(); return limit; } template struct BenchmarkCache { -std::list engine_cache_order; -std::unordered_map::iterator>, ParamsWrapperHash> engine_cache; - -// no mutexes here as caches are now thread local for v8, can also return a pointer -// to the Execution Plan if we know it will not be invalidated by another thread -cudnn_frontend::ExecutionPlan* find(const KeyType& key) { - const int lru_cache_limit = getLRUCacheLimit(); - if (lru_cache_limit < 0) { - return nullptr; - } - auto it = engine_cache.find(key); - if (it == engine_cache.end()) { - return nullptr; - } - if (lru_cache_limit) { - // update most recently accessed - engine_cache_order.splice(engine_cache_order.begin(), engine_cache_order, it->second.second); - } - return &(it->second.first); -} + std::list engine_cache_order; + std::unordered_map< + KeyType, + std::pair< + cudnn_frontend::ExecutionPlan, + typename std::list::iterator>, + ParamsWrapperHash> + engine_cache; -void update(const KeyType& key, T& results) { - int lru_cache_limit = getLRUCacheLimit(); - if (lru_cache_limit < 0) { - return; - } else if (lru_cache_limit) { + // no mutexes here as caches are now thread local for v8, can also return a + // pointer to the Execution Plan if we know it will not be invalidated by + // another thread + cudnn_frontend::ExecutionPlan* find(const KeyType& key) { + const int lru_cache_limit = getLRUCacheLimit(); + if (lru_cache_limit < 0) { + return nullptr; + } auto it = engine_cache.find(key); if (it == engine_cache.end()) { - if ((long) engine_cache.size() >= lru_cache_limit) { - auto erase_count = engine_cache.erase(engine_cache_order.back()); - TORCH_INTERNAL_ASSERT(erase_count == 1, "CUDNN V8 LRU Cache Corrupted (eviction key not in map). Please report a bug to PyTorch."); - engine_cache_order.pop_back(); - } - engine_cache_order.emplace_front(key); - engine_cache.emplace(key, std::make_pair(results, engine_cache_order.begin())); - } else { - it->second.first = results; + return nullptr; + } + if (lru_cache_limit) { // update most recently accessed - engine_cache_order.splice(engine_cache_order.begin(), engine_cache_order, it->second.second); + engine_cache_order.splice( + engine_cache_order.begin(), engine_cache_order, it->second.second); } - } else { - engine_cache.erase(key); - engine_cache.emplace(key, std::make_pair(results, engine_cache_order.end())); // dummy iterator + return &(it->second.first); } -} + void update(const KeyType& key, T& results) { + int lru_cache_limit = getLRUCacheLimit(); + if (lru_cache_limit < 0) { + return; + } else if (lru_cache_limit) { + auto it = engine_cache.find(key); + if (it == engine_cache.end()) { + if ((long)engine_cache.size() >= lru_cache_limit) { + auto erase_count = engine_cache.erase(engine_cache_order.back()); + TORCH_INTERNAL_ASSERT( + erase_count == 1, + "CUDNN V8 LRU Cache Corrupted (eviction key not in map). 
Please report a bug to PyTorch."); + engine_cache_order.pop_back(); + } + engine_cache_order.emplace_front(key); + engine_cache.emplace( + key, std::make_pair(results, engine_cache_order.begin())); + } else { + it->second.first = results; + // update most recently accessed + engine_cache_order.splice( + engine_cache_order.begin(), engine_cache_order, it->second.second); + } + } else { + engine_cache.erase(key); + engine_cache.emplace( + key, + std::make_pair(results, engine_cache_order.end())); // dummy iterator + } + } }; -// @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to be thread safe across all engines -// see Limitations in https://docs.nvidia.com/deeplearning/cudnn/release-notes/index.html -thread_local BenchmarkCache benchmark_cache; -thread_local BenchmarkCache benchmark_cache_fused; +// @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to +// be thread safe across all engines see Limitations in +// https://docs.nvidia.com/deeplearning/cudnn/release-notes/index.html +thread_local BenchmarkCache + benchmark_cache; +thread_local BenchmarkCache + benchmark_cache_fused; } // namespace -void run_conv_plan(cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const cudnn_frontend::ExecutionPlan& plan) { +void run_conv_plan( + cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const cudnn_frontend::ExecutionPlan& plan, + const cudnnBackendDescriptorType_t operation) { c10::DeviceGuard g(x.options().device()); auto workspace_size = plan.getWorkspaceSize(); - auto workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); - void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr()}; + auto workspace_ptr = + c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + void* data_ptrs[3]; + + if (operation == CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) { + data_ptrs[0] = const_cast(x.const_data_ptr()); + data_ptrs[1] = y.data_ptr(); + data_ptrs[2] = const_cast(w.const_data_ptr()); + } else if ( + operation == + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR) { + data_ptrs[0] = x.data_ptr(); + data_ptrs[1] = const_cast(y.const_data_ptr()); + data_ptrs[2] = const_cast(w.const_data_ptr()); + } else if ( + operation == + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR) { + data_ptrs[0] = const_cast(x.const_data_ptr()); + data_ptrs[1] = const_cast(y.const_data_ptr()); + data_ptrs[2] = w.data_ptr(); + } else { + data_ptrs[0] = x.data_ptr(); + data_ptrs[1] = y.data_ptr(); + data_ptrs[2] = w.data_ptr(); + } + int64_t uids[] = {'x', 'y', 'w'}; - auto variantPack = cudnn_frontend::VariantPackBuilder() - .setWorkspacePointer(workspace_size ? workspace_ptr.get() : nullptr) - .setDataPointers(3, data_ptrs) - .setUids(3, uids) - .build(); - AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc())); + auto variantPack = + cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_size ? 
workspace_ptr.get() : nullptr) + .setDataPointers(3, data_ptrs) + .setUids(3, uids) + .build(); + AT_CUDNN_CHECK(cudnnBackendExecute( + handle, plan.get_raw_desc(), variantPack.get_raw_desc())); } -void run_conv_plan_fused(cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, const cudnn_frontend::ExecutionPlan& plan) { +void run_conv_plan_fused( + cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b, + const cudnn_frontend::ExecutionPlan& plan) { c10::DeviceGuard g(x.options().device()); auto workspace_size = plan.getWorkspaceSize(); - auto workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); - void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr(), z.data_ptr(), b.data_ptr()}; + auto workspace_ptr = + c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + void* data_ptrs[] = { + x.data_ptr(), y.data_ptr(), w.data_ptr(), z.data_ptr(), b.data_ptr()}; int64_t uids[] = {'x', 'y', 'w', 'z', 'b'}; - auto variantPack = cudnn_frontend::VariantPackBuilder() - .setWorkspacePointer(workspace_size ? workspace_ptr.get() : nullptr) - .setDataPointers(5, data_ptrs) - .setUids(5, uids) - .build(); - AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc())); + auto variantPack = + cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_size ? workspace_ptr.get() : nullptr) + .setDataPointers(5, data_ptrs) + .setUids(5, uids) + .build(); + AT_CUDNN_CHECK(cudnnBackendExecute( + handle, plan.get_raw_desc(), variantPack.get_raw_desc())); } -auto build_opgraph(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const Tensor& x, const Tensor& y, const Tensor& w, const CacheKeyWrapper& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation) { +auto build_opgraph( + const cudnnHandle_t handle, + const cudnnBackendDescriptorType_t desc, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const CacheKeyWrapper& key, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation) { auto op = cudnn_frontend::OperationBuilder(desc) - .setxDesc(getTensorDescriptor(x, 'x', key.pod.x_alignment, key.pod.params.memory_format)) - .setyDesc(getTensorDescriptor(y, 'y', key.pod.y_alignment, key.pod.params.memory_format)) - .setwDesc(getTensorDescriptor(w, 'w', key.pod.w_alignment, key.pod.params.memory_format)) - .setcDesc(getConvDescriptor(key.pod.params.dataType, padding, stride, dilation, x.scalar_type())) - .build(); - std::array ops = {&op}; + .setxDesc(getTensorDescriptor( + x, 'x', key.pod.x_alignment, key.pod.params.memory_format)) + .setyDesc(getTensorDescriptor( + y, 'y', key.pod.y_alignment, key.pod.params.memory_format)) + .setwDesc(getTensorDescriptor( + w, 'w', key.pod.w_alignment, key.pod.params.memory_format)) + .setcDesc(getConvDescriptor( + key.pod.params.dataType, + padding, + stride, + dilation, + x.scalar_type())) + .build(); + std::array ops = {&op}; auto opGraph = cudnn_frontend::OperationGraphBuilder() - .setHandle(handle) - .setOperationGraph(ops.size(), ops.data()) - .build(); + .setHandle(handle) + .setOperationGraph(ops.size(), ops.data()) + .build(); return opGraph; } -auto build_opgraph_fused(const cudnnHandle_t handle, const Tensor & x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const CacheKeyFusedWrapper& key, const IntArrayRef padding, const IntArrayRef stride, 
const IntArrayRef dilation) { - // need computation to be done in FLOAT type regardless of reduced precision input +auto build_opgraph_fused( + const cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b, + const float alpha, + const CacheKeyFusedWrapper& key, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation) { + // need computation to be done in FLOAT type regardless of reduced precision + // input const auto precision = CUDNN_DATA_FLOAT; auto addDesc = cudnn_frontend::PointWiseDescBuilder() - .setMode(CUDNN_POINTWISE_ADD) - .setMathPrecision(precision) - .build(); + .setMode(CUDNN_POINTWISE_ADD) + .setMathPrecision(precision) + .build(); auto addBiasDesc = cudnn_frontend::PointWiseDescBuilder() - .setMode(CUDNN_POINTWISE_ADD) - .setMathPrecision(precision) - .build(); + .setMode(CUDNN_POINTWISE_ADD) + .setMathPrecision(precision) + .build(); auto actDesc = cudnn_frontend::PointWiseDescBuilder() - .setMode(CUDNN_POINTWISE_RELU_FWD) - .setMathPrecision(precision) - .build(); - auto convDesc = getConvDescriptor(key.pod.params.dataType, padding, stride, dilation, x.scalar_type()); + .setMode(CUDNN_POINTWISE_RELU_FWD) + .setMathPrecision(precision) + .build(); + auto convDesc = getConvDescriptor( + key.pod.params.dataType, padding, stride, dilation, x.scalar_type()); const float alpha1 = 1.0; const float alpha2 = alpha; - auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) - .setxDesc(getTensorDescriptor(x, 'x', key.pod.x_alignment, key.pod.params.memory_format)) - // virtual output of conv - .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'C', key.pod.y_alignment, precision, key.pod.params.memory_format, true)) - .setwDesc(getTensorDescriptor(w, 'w', key.pod.w_alignment, key.pod.params.memory_format)) - .setAlpha(alpha1) - .setcDesc(convDesc) - .build(); - auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) - .setxDesc(conv_op.getOutputTensor()) - .setbDesc(getTensorDescriptor(z, 'z', key.pod.z_alignment, key.pod.params.memory_format)) - // another virtual output (of add) - .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'A', key.pod.y_alignment, precision, key.pod.params.memory_format, true)) - .setpwDesc(addDesc) - .setAlpha(alpha1) - .setAlpha2(alpha2) - .build(); - auto add_bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) - .setxDesc(add_op.getOutputTensor()) - .setbDesc(getTensorDescriptor(b, 'b', key.pod.b_alignment, key.pod.params.memory_format)) - // another virtual output (of add bias) - .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'B', key.pod.y_alignment, precision, key.pod.params.memory_format, true)) - .setpwDesc(addBiasDesc) - .build(); - auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) - .setxDesc(add_bias_op.getOutputTensor()) - // final output is in original datatype - .setyDesc(getTensorDescriptor(y, 'y', key.pod.y_alignment, key.pod.params.memory_format)) - .setpwDesc(actDesc) - .build(); - std::array ops = {&conv_op, &add_op, &add_bias_op, &act_op}; + auto conv_op = + cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) + .setxDesc(getTensorDescriptor( + x, 'x', key.pod.x_alignment, key.pod.params.memory_format)) + // virtual output of conv + .setyDesc(getTensorDescriptorWithTypeVirtual( + y, + 'C', + key.pod.y_alignment, + precision, + 
key.pod.params.memory_format, + true)) + .setwDesc(getTensorDescriptor( + w, 'w', key.pod.w_alignment, key.pod.params.memory_format)) + .setAlpha(alpha1) + .setcDesc(convDesc) + .build(); + auto add_op = + cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(conv_op.getOutputTensor()) + .setbDesc(getTensorDescriptor( + z, 'z', key.pod.z_alignment, key.pod.params.memory_format)) + // another virtual output (of add) + .setyDesc(getTensorDescriptorWithTypeVirtual( + y, + 'A', + key.pod.y_alignment, + precision, + key.pod.params.memory_format, + true)) + .setpwDesc(addDesc) + .setAlpha(alpha1) + .setAlpha2(alpha2) + .build(); + auto add_bias_op = + cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(add_op.getOutputTensor()) + .setbDesc(getTensorDescriptor( + b, 'b', key.pod.b_alignment, key.pod.params.memory_format)) + // another virtual output (of add bias) + .setyDesc(getTensorDescriptorWithTypeVirtual( + y, + 'B', + key.pod.y_alignment, + precision, + key.pod.params.memory_format, + true)) + .setpwDesc(addBiasDesc) + .build(); + auto act_op = + cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(add_bias_op.getOutputTensor()) + // final output is in original datatype + .setyDesc(getTensorDescriptor( + y, 'y', key.pod.y_alignment, key.pod.params.memory_format)) + .setpwDesc(actDesc) + .build(); + std::array ops = { + &conv_op, &add_op, &add_bias_op, &act_op}; auto opGraph = cudnn_frontend::OperationGraphBuilder() .setHandle(handle) .setOperationGraph(ops.size(), ops.data()) @@ -356,31 +557,55 @@ auto build_opgraph_fused(const cudnnHandle_t handle, const Tensor & x, const Ten return opGraph; } -auto get_generator_sources(const cudnnBackendDescriptorType_t& desc, const Tensor& x, const bool deterministic, const bool allow_tf32, const cudnnBackendHeurMode_t heur_mode, const bool heuristic, const bool fallback) { +auto get_generator_sources( + const cudnnBackendDescriptorType_t& desc, + const Tensor& x, + const bool deterministic, + const bool allow_tf32, + const cudnnBackendHeurMode_t heur_mode, + const bool heuristic, + const bool fallback) { // Method for engine config generator based on heuristics - const auto heurgen_method = [/*&desc,*/ &x, deterministic, allow_tf32, heur_mode](cudnn_frontend::OperationGraph &opGraph) -> cudnn_frontend::EngineConfigList { - auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() - .setOperationGraph(opGraph) - .setHeurMode(heur_mode) - .build(); - auto &engine_configs = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); - cudnn_frontend::EngineConfigList filtered_configs; - filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, x.scalar_type()); - return filtered_configs; + const auto heurgen_method = + [/*&desc,*/ &x, deterministic, allow_tf32, heur_mode]( + cudnn_frontend::OperationGraph& opGraph) + -> cudnn_frontend::EngineConfigList { + auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() + .setOperationGraph(opGraph) + .setHeurMode(heur_mode) + .build(); + auto& engine_configs = + heuristics.getEngineConfig(heuristics.getEngineConfigCount()); + cudnn_frontend::EngineConfigList filtered_configs; + filterEngineConfigs( + engine_configs, + filtered_configs, + deterministic, + allow_tf32, + x.scalar_type()); + return filtered_configs; }; // Method for engine config generator based on fallback list - const auto fallback_method = [&desc, &x, deterministic, 
allow_tf32](cudnn_frontend::OperationGraph &opGraph) -> cudnn_frontend::EngineConfigList { + const auto fallback_method = [&desc, &x, deterministic, allow_tf32]( + cudnn_frontend::OperationGraph& opGraph) + -> cudnn_frontend::EngineConfigList { auto fallback = cudnn_frontend::EngineFallbackListBuilder() .setOperationGraph(opGraph) .setOperation(desc) .build(); - auto &fallback_list = fallback.getFallbackList(); + auto& fallback_list = fallback.getFallbackList(); cudnn_frontend::EngineConfigList filtered_configs; - filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, x.scalar_type()); + filterEngineConfigs( + fallback_list, + filtered_configs, + deterministic, + allow_tf32, + x.scalar_type()); return filtered_configs; }; if (heuristic && fallback) { - std::vector sources = {heurgen_method, fallback_method}; + std::vector sources = { + heurgen_method, fallback_method}; return sources; } else if (heuristic) { std::vector sources = {heurgen_method}; @@ -392,7 +617,7 @@ auto get_generator_sources(const cudnnBackendDescriptorType_t& desc, const Tenso } int64_t get_available_workspace() { - int device; + c10::DeviceIndex device = 0; C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); size_t max_block_size = 0; c10::cuda::CUDACachingAllocator::cacheInfo(device, &max_block_size); @@ -401,38 +626,55 @@ int64_t get_available_workspace() { static nlohmann::json errata_json_handle; -bool plan_errata_exception(const cudnnHandle_t handle, const std::string & executionPlanTag) { - static bool has_json = cudnn_frontend::load_from_config(errata_json_handle, ""); +bool plan_errata_exception( + const cudnnHandle_t handle, + const std::string& executionPlanTag) { + static bool has_json = + cudnn_frontend::load_from_config(errata_json_handle, ""); if (!has_json) { return false; } else { - return cudnn_frontend::check_errata(errata_json_handle, executionPlanTag, handle, [](){return true;}); + return cudnn_frontend::check_errata( + errata_json_handle, executionPlanTag, handle, []() { return true; }); } } -void generate_and_filter_plans(const cudnnHandle_t handle, cudnn_frontend::OperationGraph& opGraph, cudnn_frontend::EngineConfigGenerator& generator, const Tensor& x, cudnn_frontend::executionPlans_t& valid_plans, at::DataPtr& workspace_ptr) { - auto initial_predicate_function = [&](cudnn_frontend::ExecutionPlan const& plan) -> bool { +void generate_and_filter_plans( + const cudnnHandle_t handle, + cudnn_frontend::OperationGraph& opGraph, + cudnn_frontend::EngineConfigGenerator& generator, + const Tensor& x, + cudnn_frontend::executionPlans_t& valid_plans, + at::DataPtr& workspace_ptr) { + auto initial_predicate_function = + [&](cudnn_frontend::ExecutionPlan const& plan) -> bool { return plan_errata_exception(handle, plan.getTag()); }; - auto plans = generator.cudnnGetPlan(handle, opGraph, initial_predicate_function); + auto plans = + generator.cudnnGetPlan(handle, opGraph, initial_predicate_function); int64_t max_block_size = get_available_workspace(); int64_t max_workspace_size = 0; - std::for_each(plans.begin(), plans.end(), [&] (cudnn_frontend::ExecutionPlan& plan) { - int64_t curr_workspace_size = plan.getWorkspaceSize(); - if (curr_workspace_size <= max_block_size) { - if (curr_workspace_size > max_workspace_size) { - max_workspace_size = plan.getWorkspaceSize(); - } - valid_plans.emplace_back(std::move(plan)); - } - }); - TORCH_CHECK_WITH(OutOfMemoryError, max_workspace_size < 1_TiB, "Not enough memory for workspace!"); + std::for_each( + plans.begin(), plans.end(), 
[&](cudnn_frontend::ExecutionPlan& plan) { + int64_t curr_workspace_size = plan.getWorkspaceSize(); + if (curr_workspace_size <= max_block_size) { + if (curr_workspace_size > max_workspace_size) { + max_workspace_size = plan.getWorkspaceSize(); + } + valid_plans.emplace_back(std::move(plan)); + } + }); + TORCH_CHECK_WITH( + OutOfMemoryError, + max_workspace_size < 1_TiB, + "Not enough memory for workspace!"); bool remove_invalid = false; while (max_workspace_size) { try { - workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(max_workspace_size); + workspace_ptr = + c10::cuda::CUDACachingAllocator::get()->allocate(max_workspace_size); break; - } catch (c10::OutOfMemoryError &e) { + } catch (c10::OutOfMemoryError& e) { max_workspace_size /= 2; (void)cudaGetLastError(); // clear CUDA error remove_invalid = true; @@ -440,7 +682,7 @@ void generate_and_filter_plans(const cudnnHandle_t handle, cudnn_frontend::Opera } if (remove_invalid) { cudnn_frontend::executionPlans_t new_valid_plans; - for (auto &plan : valid_plans) { + for (auto& plan : valid_plans) { if (plan.getWorkspaceSize() <= max_workspace_size) { new_valid_plans.emplace_back(std::move(plan)); } @@ -449,26 +691,45 @@ void generate_and_filter_plans(const cudnnHandle_t handle, cudnn_frontend::Opera } } -auto get_plans_from_find(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const Tensor& x, const Tensor& y, const Tensor& w, const CacheKeyWrapper& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const bool deterministic, const bool allow_tf32) { - auto opGraph = build_opgraph(handle, desc, x, y, w, key, padding, stride, dilation); - void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr()}; +auto get_plans_from_find( + const cudnnHandle_t handle, + const cudnnBackendDescriptorType_t desc, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const CacheKeyWrapper& key, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const bool deterministic, + const bool allow_tf32) { + auto opGraph = + build_opgraph(handle, desc, x, y, w, key, padding, stride, dilation); + void* data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr()}; int64_t uids[] = {'x', 'y', 'w'}; - // We don't care about getting the best ordering of algos if we're roing to run all of them - auto sources = get_generator_sources(desc, x, deterministic, allow_tf32, CUDNN_HEUR_MODE_INSTANT, true, true); - cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + // We don't care about getting the best ordering of algos if we're going to + // run all of them + auto sources = get_generator_sources( + desc, x, deterministic, allow_tf32, CUDNN_HEUR_MODE_INSTANT, true, true); + cudnn_frontend::EngineConfigGenerator generator( + sources.size(), sources.data()); cudnn_frontend::executionPlans_t valid_plans; c10::DeviceGuard g(x.options().device()); at::DataPtr workspace_ptr; - generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr); - auto variantPack = cudnn_frontend::VariantPackBuilder() - .setDataPointers(3, data_ptrs) - .setUids(3, uids) - .setWorkspacePointer(workspace_ptr ? workspace_ptr.get() : nullptr) - .build(); + generate_and_filter_plans( + handle, opGraph, generator, x, valid_plans, workspace_ptr); + auto variantPack = + cudnn_frontend::VariantPackBuilder() + .setDataPointers(3, data_ptrs) + .setUids(3, uids) + .setWorkspacePointer(workspace_ptr ? 
workspace_ptr.get() : nullptr) + .build(); auto benchmark_limit = at::globalContext().benchmarkLimitCuDNN(); benchmark_limit = benchmark_limit ? benchmark_limit : 10000; - auto plans = cudnn_frontend::time_sorted_plan(handle, std::move(valid_plans), variantPack, benchmark_limit); + auto plans = cudnn_frontend::time_sorted_plan< + cudnn_frontend::CudnnFindSamplingTechnique::CUDNN_FIND_SAMPLE_ONCE>( + handle, std::move(valid_plans), variantPack, benchmark_limit); cudnn_frontend::executionPlans_t sorted_plans; for (auto& plan : plans) { @@ -477,30 +738,53 @@ auto get_plans_from_find(const cudnnHandle_t handle, const cudnnBackendDescripto return sorted_plans; } -auto get_plans_from_find_fused(const cudnnHandle_t handle, - const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, - const float alpha, const CacheKeyFusedWrapper& key, - const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, - const bool deterministic, const bool allow_tf32) { - auto opGraph = build_opgraph_fused(handle, x, y, w, z, b, alpha, key, padding, stride, dilation); - void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr(), z.data_ptr(), b.data_ptr()}; +auto get_plans_from_find_fused( + const cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b, + const float alpha, + const CacheKeyFusedWrapper& key, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const bool deterministic, + const bool allow_tf32) { + auto opGraph = build_opgraph_fused( + handle, x, y, w, z, b, alpha, key, padding, stride, dilation); + void* data_ptrs[] = { + x.data_ptr(), y.data_ptr(), w.data_ptr(), z.data_ptr(), b.data_ptr()}; int64_t uids[] = {'x', 'y', 'w', 'z', 'b'}; - auto sources = get_generator_sources(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, x, deterministic, allow_tf32, CUDNN_HEUR_MODE_INSTANT, true, true); - cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + auto sources = get_generator_sources( + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, + x, + deterministic, + allow_tf32, + CUDNN_HEUR_MODE_INSTANT, + true, + true); + cudnn_frontend::EngineConfigGenerator generator( + sources.size(), sources.data()); cudnn_frontend::executionPlans_t valid_plans; c10::DeviceGuard g(x.options().device()); at::DataPtr workspace_ptr; - generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr); - auto variantPack = cudnn_frontend::VariantPackBuilder() - .setDataPointers(5, data_ptrs) - .setUids(5, uids) - .setWorkspacePointer(workspace_ptr ? workspace_ptr.get() : nullptr) - .build(); + generate_and_filter_plans( + handle, opGraph, generator, x, valid_plans, workspace_ptr); + auto variantPack = + cudnn_frontend::VariantPackBuilder() + .setDataPointers(5, data_ptrs) + .setUids(5, uids) + .setWorkspacePointer(workspace_ptr ? workspace_ptr.get() : nullptr) + .build(); auto benchmark_limit = at::globalContext().benchmarkLimitCuDNN(); benchmark_limit = benchmark_limit ? 
benchmark_limit : 10000; - auto plans = cudnn_frontend::time_sorted_plan(handle, std::move(valid_plans), variantPack, benchmark_limit); + auto plans = cudnn_frontend::time_sorted_plan< + cudnn_frontend::CudnnFindSamplingTechnique::CUDNN_FIND_SAMPLE_ONCE>( + handle, std::move(valid_plans), variantPack, benchmark_limit); cudnn_frontend::executionPlans_t sorted_plans; for (auto& plan : plans) { @@ -509,203 +793,419 @@ auto get_plans_from_find_fused(const cudnnHandle_t handle, return sorted_plans; } - -// We only get configs from this stage to avoid building unnecessary plans that are never executed -auto get_configs_from_heuristics(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, std::string& opgraph_tag, const Tensor& x, const Tensor& y, const Tensor& w, const CacheKeyWrapper& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const bool deterministic, const bool allow_tf32, const bool fallback) { - auto opGraph = build_opgraph(handle, desc, x, y, w, key, padding, stride, dilation); +// We only get configs from this stage to avoid building unnecessary plans that +// are never executed +auto get_configs_from_heuristics( + const cudnnHandle_t handle, + const cudnnBackendDescriptorType_t desc, + std::string& opgraph_tag, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const CacheKeyWrapper& key, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const bool deterministic, + const bool allow_tf32, + const bool fallback) { + auto opGraph = + build_opgraph(handle, desc, x, y, w, key, padding, stride, dilation); opgraph_tag = opGraph.getTag(); - auto heuristic_mode = at::native::cudnnv8_use_heur_mode_b() ? CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_INSTANT; - auto sources = get_generator_sources(desc, x, deterministic, allow_tf32, heuristic_mode, !fallback, fallback); + auto heuristic_mode = at::native::cudnnv8_use_heur_mode_b() + ? CUDNN_HEUR_MODE_B + : CUDNN_HEUR_MODE_INSTANT; + auto sources = get_generator_sources( + desc, x, deterministic, allow_tf32, heuristic_mode, !fallback, fallback); - cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + cudnn_frontend::EngineConfigGenerator generator( + sources.size(), sources.data()); auto configs = generator.generate_engine_config(opGraph); return configs; } -auto get_configs_from_heuristics_fused(const cudnnHandle_t handle, std::string& opgraph_tag, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const CacheKeyFusedWrapper& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const bool deterministic, const bool allow_tf32, const bool fallback) { - auto opGraph = build_opgraph_fused(handle, x, y, w, z, b, alpha, key, padding, stride, dilation); +auto get_configs_from_heuristics_fused( + const cudnnHandle_t handle, + std::string& opgraph_tag, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b, + const float alpha, + const CacheKeyFusedWrapper& key, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const bool deterministic, + const bool allow_tf32, + const bool fallback) { + auto opGraph = build_opgraph_fused( + handle, x, y, w, z, b, alpha, key, padding, stride, dilation); opgraph_tag = opGraph.getTag(); - auto heuristic_mode = at::native::cudnnv8_use_heur_mode_b() ? 
CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_INSTANT; - auto sources = get_generator_sources(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, x, deterministic, allow_tf32, heuristic_mode, !fallback, fallback); + auto heuristic_mode = at::native::cudnnv8_use_heur_mode_b() + ? CUDNN_HEUR_MODE_B + : CUDNN_HEUR_MODE_INSTANT; + auto sources = get_generator_sources( + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, + x, + deterministic, + allow_tf32, + heuristic_mode, + !fallback, + fallback); - cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + cudnn_frontend::EngineConfigGenerator generator( + sources.size(), sources.data()); auto configs = generator.generate_engine_config(opGraph); return configs; } -void try_plans(cudnn_frontend::executionPlans_t& plans, const CacheKeyWrapper& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w) { - for (auto & plan : plans) { +void try_plans( + cudnn_frontend::executionPlans_t& plans, + const CacheKeyWrapper& key, + const cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const cudnnBackendDescriptorType_t operation) { + for (auto& plan : plans) { try { - run_conv_plan(handle, x, y, w, plan); + run_conv_plan(handle, x, y, w, plan, operation); benchmark_cache.update(key, plan); return; - } catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {} - catch (c10::OutOfMemoryError &e) { - (void)cudaGetLastError(); // clear CUDA error + } catch (cudnn_frontend::cudnnException& e) { + } catch (CuDNNError& e) { + } catch (c10::OutOfMemoryError& e) { + (void)cudaGetLastError(); // clear CUDA error } } - TORCH_CHECK(false, "FIND was unable to find an engine to execute this computation"); + TORCH_CHECK( + false, "FIND was unable to find an engine to execute this computation"); } -void try_plans_fused(cudnn_frontend::executionPlans_t& plans, const CacheKeyFusedWrapper& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b) { - for (auto & plan : plans) { +void try_plans_fused( + cudnn_frontend::executionPlans_t& plans, + const CacheKeyFusedWrapper& key, + const cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b) { + for (auto& plan : plans) { try { run_conv_plan_fused(handle, x, y, w, z, b, plan); benchmark_cache_fused.update(key, plan); return; - } catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {} - catch (c10::OutOfMemoryError &e) { - (void)cudaGetLastError(); // clear CUDA error + } catch (cudnn_frontend::cudnnException& e) { + } catch (CuDNNError& e) { + } catch (c10::OutOfMemoryError& e) { + (void)cudaGetLastError(); // clear CUDA error } } - TORCH_CHECK(false, "FIND was unable to find an engine to execute this computation"); + TORCH_CHECK( + false, "FIND was unable to find an engine to execute this computation"); } -bool try_configs(cudnn_frontend::EngineConfigList& configs, const std::string& opgraph_tag, const CacheKeyWrapper& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w) { - for (auto & config : configs) { +bool try_configs( + cudnn_frontend::EngineConfigList& configs, + const std::string& opgraph_tag, + const CacheKeyWrapper& key, + const cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const cudnnBackendDescriptorType_t operation) { + for (auto& config : configs) { try { auto plan = cudnn_frontend::ExecutionPlanBuilder() - 
.setHandle(handle) - .setEngineConfig(config, opgraph_tag) - .build(); + .setHandle(handle) + .setEngineConfig(config, opgraph_tag) + .build(); if (plan_errata_exception(handle, plan.getTag())) { continue; } - run_conv_plan(handle, x, y, w, plan); + run_conv_plan(handle, x, y, w, plan, operation); benchmark_cache.update(key, plan); return true; - } catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {} - catch (c10::OutOfMemoryError &e) { - (void)cudaGetLastError(); // clear CUDA error + } catch (cudnn_frontend::cudnnException& e) { + } catch (CuDNNError& e) { + } catch (c10::OutOfMemoryError& e) { + (void)cudaGetLastError(); // clear CUDA error } } return false; } -bool try_configs_fused(cudnn_frontend::EngineConfigList& configs, const std::string& opgraph_tag, const CacheKeyFusedWrapper& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b) { - for (auto & config : configs) { +bool try_configs_fused( + cudnn_frontend::EngineConfigList& configs, + const std::string& opgraph_tag, + const CacheKeyFusedWrapper& key, + const cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b) { + for (auto& config : configs) { try { auto plan = cudnn_frontend::ExecutionPlanBuilder() - .setHandle(handle) - .setEngineConfig(config, opgraph_tag) - .build(); + .setHandle(handle) + .setEngineConfig(config, opgraph_tag) + .build(); if (plan_errata_exception(handle, plan.getTag())) { continue; } run_conv_plan_fused(handle, x, y, w, z, b, plan); benchmark_cache_fused.update(key, plan); return true; - } catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {} - catch (c10::OutOfMemoryError &e) { - (void)cudaGetLastError(); // clear CUDA error + } catch (cudnn_frontend::cudnnException& e) { + } catch (CuDNNError& e) { + } catch (c10::OutOfMemoryError& e) { + (void)cudaGetLastError(); // clear CUDA error } } return false; } -void run_single_conv(const cudnnBackendDescriptorType_t operation, - const Tensor& x, const Tensor& y, const Tensor& w, - const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, - const bool benchmark, const bool deterministic, const bool allow_tf32) { +void run_single_conv( + const cudnnBackendDescriptorType_t operation, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const int64_t groups, + const bool benchmark, + const bool deterministic, + const bool allow_tf32) { cudnnHandle_t handle = getCudnnHandle(); - CacheKeyWrapper key(operation, y, x, w, padding, stride, dilation, groups, deterministic, allow_tf32); + CacheKeyWrapper key( + operation, + y, + x, + w, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32); // TODO: is this thread safe if cache is updated? is pointer stale? 
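  // The cache is thread_local (see BenchmarkCache above), so the pointer
  // returned by find() can only be invalidated by this same thread evicting
  // the entry. A rough sketch of the lookup/update protocol used below,
  // assuming the thread-local benchmark_cache declared earlier and a
  // hypothetical chosen_plan picked by heuristics or FIND:
  //
  //   if (auto* cached = benchmark_cache.find(key)) {
  //     run_conv_plan(handle, x, y, w, *cached, operation);  // fast path
  //   } else {
  //     // build/select chosen_plan via heuristics or FIND, run it, then
  //     benchmark_cache.update(key, chosen_plan);  // may evict the LRU entry
  //   }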
auto search = benchmark_cache.find(key); if (search) { try { - run_conv_plan(handle, x, y, w, *search); + run_conv_plan(handle, x, y, w, *search, operation); return; - } catch(c10::OutOfMemoryError &e) { + } catch (c10::OutOfMemoryError& e) { (void)cudaGetLastError(); // clear CUDA error } } if (!benchmark) { std::string opgraph_tag; // extra data needed for errata filter // heuristic configs - cudnn_frontend::EngineConfigList configs = get_configs_from_heuristics(handle, operation, - opgraph_tag, - x, y, w, key, - padding, stride, dilation, - deterministic, allow_tf32, false); - if (try_configs(configs, opgraph_tag, key, handle, x, y, w)) { return; } + cudnn_frontend::EngineConfigList configs = get_configs_from_heuristics( + handle, + operation, + opgraph_tag, + x, + y, + w, + key, + padding, + stride, + dilation, + deterministic, + allow_tf32, + false); + if (try_configs(configs, opgraph_tag, key, handle, x, y, w, operation)) { + return; + } // fallback configs - configs = get_configs_from_heuristics(handle, operation, - opgraph_tag, - x, y, w, key, - padding, stride, dilation, - deterministic, allow_tf32, true); - if (try_configs(configs, opgraph_tag, key, handle, x, y, w)) { return; } - TORCH_CHECK(false, "GET was unable to find an engine to execute this computation"); + configs = get_configs_from_heuristics( + handle, + operation, + opgraph_tag, + x, + y, + w, + key, + padding, + stride, + dilation, + deterministic, + allow_tf32, + true); + if (try_configs(configs, opgraph_tag, key, handle, x, y, w, operation)) { + return; + } + TORCH_CHECK( + false, "GET was unable to find an engine to execute this computation"); } else { - cudnn_frontend::executionPlans_t plans = get_plans_from_find(handle, operation, - x, y, w, key, - padding, stride, dilation, - deterministic, allow_tf32); + cudnn_frontend::executionPlans_t plans = get_plans_from_find( + handle, + operation, + x, + y, + w, + key, + padding, + stride, + dilation, + deterministic, + allow_tf32); // Replicate v7 behavior: clear cached blocks as benchmark incurs // significant memory consumptiont that is not needed after this step if (at::native::_cudnn_get_conv_benchmark_empty_cache()) { c10::cuda::CUDACachingAllocator::emptyCache(); } - try_plans(plans, key, handle, x, y, w); + try_plans(plans, key, handle, x, y, w, operation); } } -void run_fused_conv(const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, - float alpha, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, - int64_t groups, const bool benchmark, const bool deterministic, const bool allow_tf32) { +void run_fused_conv( + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b, + float alpha, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + const bool benchmark, + const bool deterministic, + const bool allow_tf32) { cudnnHandle_t handle = getCudnnHandle(); - CacheKeyFusedWrapper key(y, x, w, z, b, alpha, padding, stride, dilation, groups, deterministic, allow_tf32); + CacheKeyFusedWrapper key( + y, + x, + w, + z, + b, + alpha, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32); auto search = benchmark_cache_fused.find(key); if (search) { try { run_conv_plan_fused(handle, x, y, w, z, b, *search); return; - } catch(c10::OutOfMemoryError &e) { + } catch (c10::OutOfMemoryError& e) { (void)cudaGetLastError(); // clear CUDA error } } if (!benchmark) { std::string opgraph_tag; // extra data needed for errata filter // heuristic configs - 
cudnn_frontend::EngineConfigList configs = get_configs_from_heuristics_fused(handle, - opgraph_tag, - x, y, w, z, b, alpha, key, - padding, stride, dilation, - deterministic, allow_tf32, false); - if (try_configs_fused(configs, opgraph_tag, key, handle, x, y, w, z, b)) { return; } + cudnn_frontend::EngineConfigList configs = + get_configs_from_heuristics_fused( + handle, + opgraph_tag, + x, + y, + w, + z, + b, + alpha, + key, + padding, + stride, + dilation, + deterministic, + allow_tf32, + false); + if (try_configs_fused(configs, opgraph_tag, key, handle, x, y, w, z, b)) { + return; + } // fallback configs - configs = get_configs_from_heuristics_fused(handle, - opgraph_tag, - x, y, w, z, b, alpha, key, - padding, stride, dilation, - deterministic, allow_tf32, true); - if (try_configs_fused(configs, opgraph_tag, key, handle, x, y, w, z, b)) { return; } - TORCH_CHECK(false, "GET was unable to find an engine to execute this computation"); + configs = get_configs_from_heuristics_fused( + handle, + opgraph_tag, + x, + y, + w, + z, + b, + alpha, + key, + padding, + stride, + dilation, + deterministic, + allow_tf32, + true); + if (try_configs_fused(configs, opgraph_tag, key, handle, x, y, w, z, b)) { + return; + } + TORCH_CHECK( + false, "GET was unable to find an engine to execute this computation"); } else { - cudnn_frontend::executionPlans_t plans = get_plans_from_find_fused(handle, - x, y, w, z, b, alpha, key, - padding, stride, dilation, - deterministic, allow_tf32); + cudnn_frontend::executionPlans_t plans = get_plans_from_find_fused( + handle, + x, + y, + w, + z, + b, + alpha, + key, + padding, + stride, + dilation, + deterministic, + allow_tf32); try_plans_fused(plans, key, handle, x, y, w, z, b); } } void raw_cudnn_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, - const bool benchmark, const bool deterministic, const bool allow_tf32) -{ - if (output.numel() == 0) { return; } + const Tensor& output, + const Tensor& input, + const Tensor& weight, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const int64_t groups, + const bool benchmark, + const bool deterministic, + const bool allow_tf32) { + if (output.numel() == 0) { + return; + } if (at::native::cudnnv8_enabled_check_debug()) { - run_single_conv(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, - input, output, weight, padding, stride, dilation, groups, - benchmark, deterministic, allow_tf32); + run_single_conv( + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, + input, + output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } else { raw_cudnn_convolution_forward_out_v7( - output, input, weight, - padding, stride, dilation, groups, - benchmark, deterministic, allow_tf32); + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } } @@ -713,37 +1213,83 @@ void raw_cudnn_convolution_backward_input_out( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, - const bool benchmark, const bool deterministic, const bool allow_tf32) { - if (grad_input.numel() == 0) { return; } + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const int64_t groups, + const 
bool benchmark, + const bool deterministic, + const bool allow_tf32) { + if (grad_input.numel() == 0) { + return; + } if (at::native::cudnnv8_enabled_check_debug()) { - run_single_conv(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, - grad_input, grad_output, weight, padding, stride, dilation, groups, - benchmark, deterministic, allow_tf32); + run_single_conv( + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, + grad_input, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } else { raw_cudnn_convolution_backward_input_out_v7( - grad_input, - grad_output, - weight, - padding, stride, dilation, groups, - benchmark, deterministic, allow_tf32); + grad_input, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } } void raw_cudnn_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, - const bool benchmark, const bool deterministic, const bool allow_tf32) { - if (grad_weight.numel() == 0) { return; } + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const int64_t groups, + const bool benchmark, + const bool deterministic, + const bool allow_tf32) { + if (grad_weight.numel() == 0) { + return; + } if (at::native::cudnnv8_enabled_check_debug()) { - run_single_conv(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, - input, grad_output, grad_weight, padding, stride, dilation, groups, - benchmark, deterministic, allow_tf32); + run_single_conv( + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, + input, + grad_output, + grad_weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } else { raw_cudnn_convolution_backward_weight_out_v7( - grad_weight, grad_output, input, - padding, stride, dilation, groups, - benchmark, deterministic, allow_tf32); + grad_weight, + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } } @@ -761,19 +1307,46 @@ void raw_cudnn_convolution_add_relu_out( bool benchmark, bool deterministic, bool allow_tf32) { - if (output.numel() == 0) { return; } + if (output.numel() == 0) { + return; + } if (at::native::cudnnv8_enabled_check_debug()) { - auto bias_ = input.ndimension() == 4 ? bias.view({1, bias.numel(), 1, 1}) : bias.view({1, bias.numel(), 1, 1, 1}); - run_fused_conv(input, output, weight, z, bias_, - alpha, stride, padding, dilation, - groups, benchmark, deterministic, allow_tf32); + auto bias_ = input.ndimension() == 4 + ? 
bias.view({1, bias.numel(), 1, 1}) + : bias.view({1, bias.numel(), 1, 1, 1}); + run_fused_conv( + input, + output, + weight, + z, + bias_, + alpha, + stride, + padding, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } else { - raw_cudnn_convolution_add_relu_out_v7(output, input, weight, z, - alpha, bias, stride, padding, dilation, - groups, benchmark, deterministic, allow_tf32); + raw_cudnn_convolution_add_relu_out_v7( + output, + input, + weight, + z, + alpha, + bias, + stride, + padding, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } } -}} // at::native +} // namespace native +} // namespace at -#endif // AT_CUDNN_ENABLED +#endif // AT_CUDNN_ENABLED diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp index 8697b89c399af..af6b13567e37c 100644 --- a/aten/src/ATen/native/cudnn/GridSampler.cpp +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -1,6 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include #include +#include #include #include @@ -8,58 +8,61 @@ #include #include #else -#include -#include #include +#include +#include #endif #if !AT_CUDNN_ENABLED() -namespace at { namespace native { +namespace at { +namespace native { // See Note [ATen preprocessor philosophy] -Tensor cudnn_grid_sampler_forward( - const Tensor& input_t, const Tensor& grid_t) { +Tensor cudnn_grid_sampler_forward(const Tensor& input_t, const Tensor& grid_t) { AT_ERROR("cudnn_grid_sampler_forward: ATen not compiled with cuDNN support"); } std::tuple cudnn_grid_sampler_backward( - const Tensor& input_t, const Tensor& grid_t, + const Tensor& input_t, + const Tensor& grid_t, const Tensor& grad_output_t) { AT_ERROR("cudnn_grid_sampler_backward: ATen not compiled with cuDNN support"); } -}} +} // namespace native +} // namespace at #else // AT_CUDNN_ENABLED +#include #include #include #include -#include #include #include // TODO: descriptor checking - -namespace at { namespace native { +namespace at { +namespace native { namespace { -void setSamplerDescriptor(SpatialTransformerDescriptor& desc, cudnnDataType_t dataType, const at::Tensor& tensor) -{ +void setSamplerDescriptor( + SpatialTransformerDescriptor& desc, + cudnnDataType_t dataType, + const at::Tensor& tensor) { int inputSize[4] = {0}; for (const auto i : c10::irange(tensor.dim())) { - inputSize[i] = (int) tensor.size(i); + inputSize[i] = (int)tensor.size(i); } desc.set(dataType, 4, inputSize); } -void checkGridSize(CheckedFrom c, TensorArg grid, TensorArg input) -{ +void checkGridSize(CheckedFrom c, TensorArg grid, TensorArg input) { // assert size of grid is n*h*w*2 // FYI: grid is between [-1, 1], where -1 left most pixel, // 1 represents right most pixel (and hence 0 is the center pixel) @@ -72,22 +75,19 @@ void checkGridSize(CheckedFrom c, TensorArg grid, TensorArg input) checkSize(c, grid, 3, 2); } -} // namespace +} // namespace -Tensor cudnn_grid_sampler_forward( - const Tensor& input_t, const Tensor& grid_t) -{ +Tensor cudnn_grid_sampler_forward(const Tensor& input_t, const Tensor& grid_t) { // See NOTE [ grid_sampler Native Functions ]. // Add checks here in case this is called instead of grid_sampler. 
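  // Shape contract enforced by the checks below (see checkGridSize/checkDim):
  // input is (N, C, H_in, W_in), grid is (N, H_out, W_out, 2) with values in
  // [-1, 1], and the output allocated further down is (N, C, H_out, W_out).
  // Illustrative call, assuming 4-D CUDA tensors of the same scalar type:
  //
  //   auto out = at::native::cudnn_grid_sampler_forward(input, grid);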
check_grid_sampler_common(input_t, grid_t); TORCH_CHECK( - cond_cudnn_grid_sampler(input_t, grid_t), - "Invalid arguments to cudnn_grid_sampler_forward"); + cond_cudnn_grid_sampler(input_t, grid_t), + "Invalid arguments to cudnn_grid_sampler_forward"); auto input_contig = contiguousIfZeroInStrides(input_t); auto grid_contig = grid_t.contiguous(); - TensorArg input{ input_contig, "input", 1 }, - grid{ grid_contig, "grid", 2 }; + TensorArg input{input_contig, "input", 1}, grid{grid_contig, "grid", 2}; CheckedFrom c = "cudnn_grid_sampler_forward"; checkAllSameGPU(c, {input, grid}); checkAllSameType(c, {input, grid}); @@ -95,10 +95,11 @@ Tensor cudnn_grid_sampler_forward( checkDim(c, input, 4); auto output_t = at::empty({0}, input->options()); - output_t.resize_({input->size(0), input->size(1), grid->size(1), grid->size(2)}); + output_t.resize_( + {input->size(0), input->size(1), grid->size(1), grid->size(2)}); - TensorDescriptor idesc{ *input }; // input descriptor - TensorDescriptor odesc{ output_t }; // output descriptor + TensorDescriptor idesc{*input}; // input descriptor + TensorDescriptor odesc{output_t}; // output descriptor SpatialTransformerDescriptor desc; // sampler descriptor auto handle = getCudnnHandle(); @@ -108,11 +109,15 @@ Tensor cudnn_grid_sampler_forward( Constant one(dataType, 1); Constant zero(dataType, 0); AT_CUDNN_CHECK(cudnnSpatialTfSamplerForward( - handle, desc.desc(), - &one, idesc.desc(), input->data_ptr(), - grid->data_ptr(), - &zero, odesc.desc(), output_t.data_ptr() - )); + handle, + desc.desc(), + &one, + idesc.desc(), + input->const_data_ptr(), + grid->const_data_ptr(), + &zero, + odesc.desc(), + output_t.data_ptr())); return output_t; } @@ -120,22 +125,21 @@ Tensor cudnn_grid_sampler_forward( // NB: CuDNN does not support output mask; you always get both // gradients. std::tuple cudnn_grid_sampler_backward( - const Tensor& input_t, const Tensor& grid_t, - const Tensor& grad_output_t) -{ + const Tensor& input_t, + const Tensor& grid_t, + const Tensor& grad_output_t) { // See NOTE [ grid_sampler Native Functions ]. // Add checks here in case this is called instead of grid_sampler. 
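  // As noted above, cuDNN always computes both gradients (there is no
  // output-mask variant). Illustrative usage, assuming grad_output has the
  // forward output's shape (N, C, H_out, W_out):
  //
  //   auto [grad_input, grad_grid] =
  //       at::native::cudnn_grid_sampler_backward(input, grid, grad_output);
  //
  // grad_input matches input's sizes and grad_grid matches grid's sizes (see
  // the resize_ calls below).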
check_grid_sampler_common(input_t, grid_t); TORCH_CHECK( - cond_cudnn_grid_sampler(input_t, grid_t), - "Invalid arguments to cudnn_grid_sampler_backward"); + cond_cudnn_grid_sampler(input_t, grid_t), + "Invalid arguments to cudnn_grid_sampler_backward"); auto input_contig = contiguousIfZeroInStrides(input_t); auto grid_contig = grid_t.contiguous(); auto grad_output_contig = contiguousIfZeroInStrides(grad_output_t); - TensorArg input{ input_contig, "input", 1 }, - grid{ grid_contig, "grid", 2 }, - grad_output{ grad_output_contig, "grad_output", 3 }; + TensorArg input{input_contig, "input", 1}, grid{grid_contig, "grid", 2}, + grad_output{grad_output_contig, "grad_output", 3}; CheckedFrom c = "cudnn_grid_sampler_backward"; checkAllSameGPU(c, {input, grad_output, grid}); checkGridSize(c, grid, input); @@ -147,9 +151,9 @@ std::tuple cudnn_grid_sampler_backward( auto grad_grid_t = at::empty({0}, grid->options()); grad_grid_t.resize_(grid->sizes()); - TensorDescriptor idesc{ *input }; // input descriptor - TensorDescriptor odesc{ *grad_output }; // grad_output descriptor - TensorDescriptor gdesc{ grad_input_t }; // grad_input descriptor + TensorDescriptor idesc{*input}; // input descriptor + TensorDescriptor odesc{*grad_output}; // grad_output descriptor + TensorDescriptor gdesc{grad_input_t}; // grad_input descriptor SpatialTransformerDescriptor desc; // sampler descriptor auto handle = getCudnnHandle(); @@ -159,18 +163,26 @@ std::tuple cudnn_grid_sampler_backward( Constant one(dataType, 1); Constant zero(dataType, 0); AT_CUDNN_CHECK(cudnnSpatialTfSamplerBackward( - handle, desc.desc(), - &one, idesc.desc(), input->data_ptr(), - &zero, gdesc.desc(), grad_input_t.data_ptr(), - &one, odesc.desc(), grad_output->data_ptr(), - // intruigingly, the outputs don't need descriptors - grid->data_ptr(), - &zero, grad_grid_t.data_ptr() - )); - - return std::tuple{ grad_input_t, grad_grid_t }; + handle, + desc.desc(), + &one, + idesc.desc(), + input->const_data_ptr(), + &zero, + gdesc.desc(), + grad_input_t.data_ptr(), + &one, + odesc.desc(), + grad_output->const_data_ptr(), + // intriguingly, the outputs don't need descriptors + grid->const_data_ptr(), + &zero, + grad_grid_t.data_ptr())); + + return std::tuple{grad_input_t, grad_grid_t}; } -}} // namespace at::cudnn +} // namespace native +} // namespace at #endif diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp index cb08b57c309c1..dff3bf9b80141 100644 --- a/aten/src/ATen/native/cudnn/LossCTC.cpp +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -1,9 +1,9 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include #include +#include #include #if AT_CUDNN_ENABLED() - #include +#include #endif #ifndef AT_PER_OPERATOR_HEADERS @@ -20,7 +20,8 @@ #if (!AT_CUDNN_ENABLED()) -namespace at { namespace native { +namespace at { +namespace native { // See Note [ATen preprocessor philosophy] @@ -42,7 +43,14 @@ bool _use_cudnn_ctc_loss_tensor( return false; } -std::tuple _cudnn_ctc_loss(const Tensor& log_probs, const Tensor& targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t BLANK, bool deterministic, bool zero_infinity) { +std::tuple _cudnn_ctc_loss( + const Tensor& log_probs, + const Tensor& targets, + IntArrayRef input_lengths, + IntArrayRef target_lengths, + int64_t BLANK, + bool deterministic, + bool zero_infinity) { AT_ERROR("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); } @@ -57,7 +65,8 @@ std::tuple _cudnn_ctc_loss_tensor( AT_ERROR("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 
support"); } -}} +} // namespace native +} // namespace at #else // AT_CUDNN_ENABLED @@ -68,7 +77,8 @@ std::tuple _cudnn_ctc_loss_tensor( #include #include -namespace at { namespace native { +namespace at { +namespace native { bool _use_cudnn_ctc_loss( const Tensor& log_probs, @@ -82,8 +92,7 @@ bool _use_cudnn_ctc_loss( (targets.dim() == 1) && (log_probs.scalar_type() == at::kFloat) && (targets.scalar_type() == at::kInt) && (log_probs.device().type() == at::kCUDA) && - (targets.device().type() == at::kCPU) && - (targets.is_contiguous()) && + (targets.device().type() == at::kCPU) && (targets.is_contiguous()) && (log_probs.dim() == 3); if (use_cudnn) { @@ -96,8 +105,8 @@ bool _use_cudnn_ctc_loss( for (const auto b : c10::irange(target_lengths.size())) { // target length < 256 is documented, but we see illegal memory accesses // when target lengths > input lengths for CuDNN - use_cudnn = - use_cudnn && (target_lengths[b] < 256) && (target_lengths[b] <= input_lengths[b]); + use_cudnn = use_cudnn && (target_lengths[b] < 256) && + (target_lengths[b] <= input_lengths[b]); } } return use_cudnn; @@ -113,15 +122,21 @@ bool _use_cudnn_ctc_loss_tensor( Tensor tlc = target_lengths.to(Device(at::kCPU), at::kLong).contiguous(); IntArrayRef il(ilc.data_ptr(), ilc.numel()); IntArrayRef tl(tlc.data_ptr(), tlc.numel()); - return at::_use_cudnn_ctc_loss( - log_probs, targets, il, tl, BLANK); + return at::_use_cudnn_ctc_loss(log_probs, targets, il, tl, BLANK); } -std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tensor& targets_t, IntArrayRef input_lengths_, IntArrayRef target_lengths_, int64_t BLANK, bool deterministic, bool zero_infinity) { +std::tuple _cudnn_ctc_loss( + const Tensor& log_probs_t, + const Tensor& targets_t, + IntArrayRef input_lengths_, + IntArrayRef target_lengths_, + int64_t BLANK, + bool deterministic, + bool zero_infinity) { (void)zero_infinity; // only used for backward const CheckedFrom c = "cudnn_ctc_loss"; - const TensorArg log_probs { log_probs_t, "log_probs", 1 }; - const TensorArg targets { targets_t, "targets", 2 }; + const TensorArg log_probs{log_probs_t, "log_probs", 1}; + const TensorArg targets{targets_t, "targets", 2}; checkDim(c, log_probs, 3); checkScalarType(c, log_probs, kFloat); checkDim(c, targets, 1); @@ -130,11 +145,16 @@ std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tens checkBackend(c, {*log_probs}, Backend::CUDA); checkBackend(c, {*targets}, Backend::CPU); const auto batch_size = log_probs->size(1); - TORCH_CHECK(static_cast(input_lengths_.size()) == batch_size, "input_lengths needs to have size to match batch_size"); - TORCH_CHECK(static_cast(target_lengths_.size()) == batch_size, "target_lengths needs to have size to match batch_size"); + TORCH_CHECK( + static_cast(input_lengths_.size()) == batch_size, + "input_lengths needs to have size to match batch_size"); + TORCH_CHECK( + static_cast(target_lengths_.size()) == batch_size, + "target_lengths needs to have size to match batch_size"); std::vector input_lengths(input_lengths_.begin(), input_lengths_.end()); - std::vector target_lengths(target_lengths_.begin(), target_lengths_.end()); + std::vector target_lengths( + target_lengths_.begin(), target_lengths_.end()); TORCH_CHECK(BLANK == 0, "blank must be label 0 for cudnn_ctc_loss"); // checked in dispatch: @@ -143,7 +163,9 @@ std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tens const auto handle = getCudnnHandle(); - const cudnnCTCLossAlgo_t algo = (deterministic ? 
CUDNN_CTC_LOSS_ALGO_DETERMINISTIC : CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC); + const cudnnCTCLossAlgo_t algo = + (deterministic ? CUDNN_CTC_LOSS_ALGO_DETERMINISTIC + : CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC); CTCLossDescriptor ctc_loss_desc; @@ -167,7 +189,8 @@ std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tens ctc_loss_desc.desc(), &workspace_size)); - Tensor workspace = at::empty(workspace_size, log_probs->options().dtype(kByte)); + Tensor workspace = + at::empty(workspace_size, log_probs->options().dtype(kByte)); Tensor costs = at::empty({log_probs->size(1)}, log_probs->options()); AT_CUDNN_CHECK(cudnnCTCLoss( @@ -203,6 +226,7 @@ std::tuple _cudnn_ctc_loss_tensor( log_probs, targets, il, tl, BLANK, deterministic, zero_infinity); } -}} // namespace at::native +} // namespace native +} // namespace at #endif diff --git a/aten/src/ATen/native/cudnn/MHA.cpp b/aten/src/ATen/native/cudnn/MHA.cpp new file mode 100644 index 0000000000000..1f6bdbf5305a2 --- /dev/null +++ b/aten/src/ATen/native/cudnn/MHA.cpp @@ -0,0 +1,681 @@ +#include +#include +#include + +#if defined(USE_ROCM) || !AT_CUDNN_ENABLED() || \ + (defined(CUDNN_VERSION) && CUDNN_VERSION < 8900) + +namespace at { +namespace native { + +void run_cudnn_SDP_fprop( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool isTraining, + bool is_causal, + double dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + Tensor& softmaxstats, + Tensor& o, + Tensor& dropoutseed, + Tensor& dropoutoffset) { + TORCH_CHECK( + false, "PyTorch was not compiled with cuDNN Flash Attention enabled!"); +} + +void run_cudnn_SDP_bprop( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset) { + TORCH_CHECK( + false, "PyTorch was not compiled with cuDNN Flash Attention enabled!"); +} + +} // namespace native +} // namespace at + +#else // AT_CUDNN_ENABLED && defined(CUDNN_VERSION) && CUDNN_VERSION >= 8900 +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +namespace at { +namespace native { + +#include + +namespace fe = cudnn_frontend; +using graph_and_tensors = std::tuple< + std::shared_ptr, + std::shared_ptr, // Q, + std::shared_ptr, // K, + std::shared_ptr, // V, + std::shared_ptr, // Attn_scale, + // TODO(eqy): additional options + // std::shared_ptr, // Bias, + // std::shared_ptr, // SEQ_LEN_Q, + // std::shared_ptr, // SEQ_LEN_KV, + std::shared_ptr, // Seed, + std::shared_ptr, // Offset, + // std::shared_ptr, // Dropout_mask, + // std::shared_ptr, // Dropout_scale + std::shared_ptr, // O + std::shared_ptr // Stats + >; + +using graph_and_tensors_backward = std::tuple< + std::shared_ptr, + std::shared_ptr, // Q, + std::shared_ptr, // K, + std::shared_ptr, // V, + std::shared_ptr, // Attn_scale + std::shared_ptr, // Seed, + std::shared_ptr, // Offset, + std::shared_ptr, // O, + std::shared_ptr, // dO, + std::shared_ptr, // stats, + std::shared_ptr, // dQ, + std::shared_ptr, // dK,, + std::shared_ptr // dV, + >; + +#define MAX_MHA_DIM 4 + +struct MHAParams { + c10::DeviceIndex device_id; + fe::DataType_t dataType; + std::array q_dim; + std::array k_dim; + std::array v_dim; + std::array q_stride; + 
std::array k_stride; + std::array v_stride; + int64_t b; + int64_t h; + int64_t s_q; + int64_t s_kv; + int64_t d; + double dropout_probability; + bool is_causal; + bool return_softmaxstats; +}; + +void setMHAParams( + MHAParams& params, + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + const Tensor& q, + const Tensor& k, + const Tensor& v, + double dropout_probability, + bool is_causal, + bool return_softmaxstats) { + memset(¶ms, 0, sizeof(MHAParams)); + params.device_id = at::cuda::current_device(); + params.dataType = fe::DataType_t::HALF; + if (q.scalar_type() == kBFloat16) { + params.dataType = fe::DataType_t::BFLOAT16; + } + params.b = b; + params.h = h; + params.d = d; + params.s_q = s_q; + params.s_kv = s_kv; + params.dropout_probability = dropout_probability; + params.is_causal = is_causal; + params.return_softmaxstats = return_softmaxstats; + TORCH_INTERNAL_ASSERT( + q.sizes().size() == MAX_MHA_DIM, + "Q tensor has unexpected number of dims, please report a bug to PyTorch."); + TORCH_INTERNAL_ASSERT( + q.strides().size() == MAX_MHA_DIM, + "Q tensor has unexpected number of dims, please report a bug to PyTorch."); + TORCH_INTERNAL_ASSERT( + k.sizes().size() == MAX_MHA_DIM, + "K tensor has unexpected number of dims, please report a bug to PyTorch."); + TORCH_INTERNAL_ASSERT( + k.strides().size() == MAX_MHA_DIM, + "K tensor has unexpected number of dims, please report a bug to PyTorch."); + TORCH_INTERNAL_ASSERT( + v.sizes().size() == MAX_MHA_DIM, + "V tensor has unexpected number of dims, please report a bug to PyTorch."); + TORCH_INTERNAL_ASSERT( + v.strides().size() == MAX_MHA_DIM, + "V tensor has unexpected number of dims, please report a bug to PyTorch."); + std::copy(q.sizes().begin(), q.sizes().end(), params.q_dim.begin()); + std::copy(q.strides().begin(), q.strides().end(), params.q_stride.begin()); + std::copy(k.sizes().begin(), k.sizes().end(), params.k_dim.begin()); + std::copy(k.strides().begin(), k.strides().end(), params.k_stride.begin()); + std::copy(v.sizes().begin(), v.sizes().end(), params.v_dim.begin()); + std::copy(v.strides().begin(), v.strides().end(), params.v_stride.begin()); +} + +struct MHACacheKeyWrapper : ParamsWrapper { + MHACacheKeyWrapper( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + const Tensor& q, + const Tensor& k, + const Tensor& v, + double dropout_probability, + bool is_causal, + bool return_softmaxstats) { + setMHAParams( + this->pod, + b, + h, + s_q, + s_kv, + d, + q, + k, + v, + dropout_probability, + is_causal, + return_softmaxstats); + } +}; + +template +struct MHAGraphCache { + std::unordered_map> engine_cache; + + // no mutexes here as caches are now thread local for v8, can also return a + // pointer to the Execution Plan if we know it will not be invalidated by + // another thread + T* find(const KeyType& key) { + auto it = engine_cache.find(key); + if (it == engine_cache.end()) { + return nullptr; + } + return &(it->second); + } + + void update(const KeyType& key, T& results) { + engine_cache.erase(key); + engine_cache.emplace(key, std::move(results)); + } +}; + +// @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to +// be thread safe across all engines see Limitations in +// https://docs.nvidia.com/deeplearning/cudnn/release-notes/index.html +thread_local MHAGraphCache mhagraphcache; +thread_local MHAGraphCache + mhagraphbackwardcache; + +auto build_graph_and_tensors( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + 
bool return_softmaxstats, + bool is_causal, + double dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + Tensor& softmaxstats, + Tensor& o, + Tensor& dropoutseed, + Tensor& dropoutoffset, + cudnnHandle_t& handle, + MHAParams& params) { + auto dtype = fe::DataType_t::HALF; + if (q.scalar_type() == kBFloat16) { + dtype = fe::DataType_t::BFLOAT16; + } + auto mha_graph = std::make_shared(); + // We're baking in float accumulation and scale types + // in theory the graph may support other types, but they + // have not been tested + mha_graph->set_io_data_type(dtype) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + auto Q = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("Q") + .set_dim( + std::vector(params.q_dim.begin(), params.q_dim.end())) + .set_stride(std::vector( + params.q_stride.begin(), params.q_stride.end()))); + auto K = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("K") + .set_dim( + std::vector(params.k_dim.begin(), params.k_dim.end())) + .set_stride(std::vector( + params.k_stride.begin(), params.k_stride.end()))); + auto V = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("V") + .set_dim( + std::vector(params.v_dim.begin(), params.v_dim.end())) + .set_stride(std::vector( + params.v_stride.begin(), params.v_stride.end()))); + auto attn_scale = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); + // TODO(eqy): support bias in the future in a follow-up PR + // auto bias = mha_graph->tensor(fe::graph::Tensor_attributes() + // .set_name("bias") + // .set_dim({b, 1, s_q, s_kv}) + // .set_stride({s_q * s_kv, s_q * s_kv, s_kv, 1})); + auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Seed") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto scaled_dot_product_flash_attention_options = + fe::graph::SDPA_attributes() + .set_name("CUDNN_SDPA") + .set_is_inference(return_softmaxstats == false) + .set_causal_mask(is_causal) + .set_attn_scale(attn_scale) + .set_dropout(dropout_probability, seed, offset); + // Optional bias in flash attention is only supported 8.9.3 onwards + if (cudnnGetVersion() >= 8904) { + // scaled_dot_product_flash_attention_options.set_alibi_mask(true); + } + + auto seq_q = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Seq_q") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto seq_kv = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Seq_kv") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + + // if (cudnnGetVersion() >= 8903) { + // scaled_dot_product_flash_attention_options.set_bias(bias) + // .set_padding_mask(true) + // .set_seq_len_q(seq_q) + // .set_seq_len_kv(seq_kv); + // } + + auto [O, Stats] = + mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options); + O->set_output(true) + .set_dim(std::vector( + o.sizes().data(), o.sizes().data() + o.sizes().size())) + .set_stride(std::vector( + o.strides().data(), o.strides().data() + o.strides().size())); + + if (Stats) { + 
Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT); + } + + AT_CUDNN_FRONTEND_CHECK(mha_graph->validate()); + AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle)); + AT_CUDNN_FRONTEND_CHECK( + mha_graph->create_execution_plans({fe::HeurMode_t::A})); + AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle)); + AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle)); + + return std::make_tuple( + std::move(mha_graph), + std::move(Q), + std::move(K), + std::move(V), + std::move(attn_scale), + std::move(seed), + std::move(offset), + std::move(O), + std::move(Stats)); +} + +auto build_graph_and_tensors_backward( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset, + cudnnHandle_t& handle, + MHAParams& params) { + auto dtype = fe::DataType_t::HALF; + if (q.scalar_type() == kBFloat16) { + dtype = fe::DataType_t::BFLOAT16; + } + auto mha_graph = std::make_shared(); + // We're baking in float accumulation and scale types + // in theory the graph may support other types, but they + // have not been tested + mha_graph->set_io_data_type(dtype) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + auto Q = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("Q") + .set_dim(std::vector(q.sizes().begin(), q.sizes().end())) + .set_stride( + std::vector(q.strides().begin(), q.strides().end()))); + auto K = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("K") + .set_dim(std::vector(k.sizes().begin(), k.sizes().end())) + .set_stride( + std::vector(k.strides().begin(), k.strides().end()))); + auto V = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("V") + .set_dim(std::vector(v.sizes().begin(), v.sizes().end())) + .set_stride( + std::vector(v.strides().begin(), v.strides().end()))); + auto attn_scale = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); + auto Seed = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Seed") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto Offset = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto O = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("O") + .set_dim(std::vector(o.sizes().begin(), o.sizes().end())) + .set_stride( + std::vector(o.strides().begin(), o.strides().end()))); + auto STATS = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("Stats") + .set_dim(std::vector( + softmaxstats.sizes().begin(), softmaxstats.sizes().end())) + .set_stride(std::vector( + softmaxstats.strides().begin(), softmaxstats.strides().end())) + .set_data_type(fe::DataType_t::FLOAT)); + auto DO = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("DO") + .set_dim(std::vector(dO.sizes().begin(), dO.sizes().end())) + .set_stride( + std::vector(dO.strides().begin(), dO.strides().end()))); + auto sdpa_backward_options = fe::graph::SDPA_backward_attributes() + .set_name("CUDNN_SDPA_BACKWARD") + 
.set_causal_mask(is_causal) + .set_attn_scale(attn_scale); + if (dropout_probability != 0.0f) { + sdpa_backward_options.set_dropout(dropout_probability, Seed, Offset); + } + auto [DQ, DK, DV] = + mha_graph->sdpa_backward(Q, K, V, O, DO, STATS, sdpa_backward_options); + DQ->set_output(true) + .set_dim(std::vector(dQ.sizes().begin(), dQ.sizes().end())) + .set_stride( + std::vector(dQ.strides().begin(), dQ.strides().end())); + DK->set_output(true) + .set_dim(std::vector(dK.sizes().begin(), dK.sizes().end())) + .set_stride( + std::vector(dK.strides().begin(), dK.strides().end())); + DV->set_output(true) + .set_dim(std::vector(dV.sizes().begin(), dV.sizes().end())) + .set_stride( + std::vector(dV.strides().begin(), dV.strides().end())); + AT_CUDNN_FRONTEND_CHECK(mha_graph->validate()); + AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle)); + AT_CUDNN_FRONTEND_CHECK( + mha_graph->create_execution_plans({fe::HeurMode_t::A})); + AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle)); + AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle)); + return std::make_tuple( + std::move(mha_graph), + std::move(Q), + std::move(K), + std::move(V), + std::move(attn_scale), + std::move(Seed), + std::move(Offset), + std::move(O), + std::move(DO), + std::move(STATS), + std::move(DQ), + std::move(DK), + std::move(DV)); +} + +void run_cudnn_SDP_fprop( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool return_softmaxstats, + bool is_causal, + double dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + Tensor& softmaxstats, + Tensor& o, + Tensor& dropoutseed, + Tensor& dropoutoffset) { + cudnnHandle_t handle = getCudnnHandle(); + o = at::empty_strided( + {b, h, s_q, d}, {s_q * h * d, d, h * d, 1}, q.options()); + if (return_softmaxstats) { + // TODO(eqy): verify that this is correct + softmaxstats = at::empty({b, h, s_q}, q.options().dtype(kFloat)); + } + + auto key = MHACacheKeyWrapper( + b, + h, + s_q, + s_kv, + d, + q, + k, + v, + dropout_probability, + is_causal, + return_softmaxstats); + auto graph_and_tensors_ptr = mhagraphcache.find(key); + graph_and_tensors graph_and_tensors_values; + if (graph_and_tensors_ptr) { + graph_and_tensors_values = *graph_and_tensors_ptr; + } else { + graph_and_tensors_values = build_graph_and_tensors( + b, + h, + s_q, + s_kv, + d, + scaling_factor, + return_softmaxstats, + is_causal, + dropout_probability, + q, + k, + v, + softmaxstats, + o, + dropoutseed, + dropoutoffset, + handle, + key.pod); + } + auto [mha_graph, Q, K, V, attn_scale, seed, offset, O, Stats] = + graph_and_tensors_values; + std::unordered_map, void*> + variant_pack = { + {Q, q.data_ptr()}, + {K, k.data_ptr()}, + {V, v.data_ptr()}, + {attn_scale, &scaling_factor}, + //{bias, bias.data_ptr()}, + {seed, dropoutseed.data_ptr()}, + {offset, dropoutoffset.data_ptr()}, + {O, o.data_ptr()}}; + if (return_softmaxstats) { + variant_pack[Stats] = softmaxstats.data_ptr(); + } + auto workspace_size = mha_graph->get_workspace_size(); + auto workspace_ptr = + c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + TORCH_CHECK( + mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good()); + mhagraphcache.update(key, graph_and_tensors_values); +} + +void run_cudnn_SDP_bprop( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const Tensor& o, + const Tensor& dO, + 
const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset) { + cudnnHandle_t handle = getCudnnHandle(); + auto key = MHACacheKeyWrapper( + b, h, s_q, s_kv, d, q, k, v, dropout_probability, is_causal, true); + auto graph_and_tensors_backward_ptr = mhagraphbackwardcache.find(key); + graph_and_tensors_backward graph_and_tensors_backward_values; + if (graph_and_tensors_backward_ptr) { + graph_and_tensors_backward_values = *graph_and_tensors_backward_ptr; + } else { + graph_and_tensors_backward_values = build_graph_and_tensors_backward( + b, + h, + s_q, + s_kv, + d, + scaling_factor, + is_causal, + dropout_probability, + q, + k, + v, + o, + dO, + softmaxstats, + dQ, + dK, + dV, + dropoutseed, + dropoutoffset, + handle, + key.pod); + } + auto + [mha_graph, Q, K, V, attn_scale, Seed, Offset, O, Do, Stats, Dq, Dk, Dv] = + graph_and_tensors_backward_values; + std::unordered_map, void*> + variant_pack = {// inputs + {Q, q.data_ptr()}, + {K, k.data_ptr()}, + {V, v.data_ptr()}, + {O, o.data_ptr()}, + {Do, dO.data_ptr()}, + {Stats, softmaxstats.data_ptr()}, + // outputs + {Dq, dQ.data_ptr()}, + {Dk, dK.data_ptr()}, + {Dv, dV.data_ptr()}, + // pass by value + {attn_scale, &scaling_factor}}; + if (dropout_probability != 0.0f) { + variant_pack[Seed] = dropoutseed.data_ptr(); + variant_pack[Offset] = dropoutoffset.data_ptr(); + } + auto workspace_size = mha_graph->get_workspace_size(); + auto workspace_ptr = + c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + TORCH_CHECK(!workspace_size || workspace_ptr.get()); + TORCH_CHECK( + mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good()); + mhagraphbackwardcache.update(key, graph_and_tensors_backward_values); +} + +} // namespace native +} // namespace at + +#endif diff --git a/aten/src/ATen/native/cudnn/MHA.h b/aten/src/ATen/native/cudnn/MHA.h new file mode 100644 index 0000000000000..0406cf783dc53 --- /dev/null +++ b/aten/src/ATen/native/cudnn/MHA.h @@ -0,0 +1,47 @@ +#pragma once +#include + +namespace at { +namespace native { + +void run_cudnn_SDP_fprop( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool isTraining, + bool is_causal, + double dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + Tensor& softmaxstats, + Tensor& o, + Tensor& dropoutseed, + Tensor& dropoutoffset); + +void run_cudnn_SDP_bprop( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 7b758309d4cbd..05b1df3114f85 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -1,16 +1,16 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include #include +#include +#include +#include #include #include -#include #include -#include #include -#include -#include #include +#include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -29,1038 +29,1222 @@ #if !AT_CUDNN_ENABLED() -namespace at { namespace native { +namespace at { +namespace native { // See Note [ATen preprocessor philosophy] Tensor _cudnn_rnn_flatten_weight( - 
TensorList weight_arr, int64_t weight_stride0, + TensorList weight_arr, + int64_t weight_stride0, int64_t input_size, - int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, - int64_t fn_num_layers, bool batch_first, - bool fn_bidirectional - ) { + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_proj_size, + int64_t fn_num_layers, + bool batch_first, + bool fn_bidirectional) { AT_ERROR("_cudnn_rnn_flatten_weight: ATen not compiled with cuDNN support"); } std::tuple _cudnn_rnn( const Tensor& input_r, - TensorList weight, int64_t weight_stride0, const c10::optional& weight_buf_r_opt, const Tensor& hx, const c10::optional& cx_opt, - int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, - int64_t fn_num_layers, bool batch_first, double fn_dropout, - bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, const c10::optional& fn_dropout_state_opt - ) { + TensorList weight, + int64_t weight_stride0, + const c10::optional& weight_buf_r_opt, + const Tensor& hx, + const c10::optional& cx_opt, + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_proj_size, + int64_t fn_num_layers, + bool batch_first, + double fn_dropout, + bool fn_train, + bool fn_bidirectional, + IntArrayRef fn_batch_sizes, + const c10::optional& fn_dropout_state_opt) { AT_ERROR("_cudnn_rnn: ATen not compiled with cuDNN support"); } std::tuple> _cudnn_rnn_backward( - const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const c10::optional& cx_opt, - const Tensor& output, const c10::optional& grad_output_r_opt, const c10::optional& grad_hy_r_opt, const c10::optional& grad_cy_r_opt, - int64_t mode, int64_t hidden_size, int64_t proj_size, - int64_t num_layers, bool batch_first, double dropout, - bool train, bool bidirectional, IntArrayRef batch_sizes, const c10::optional& dropout_state_opt, const Tensor& reserve, - std::array output_mask - ) { + const Tensor& input, + TensorList weight, + int64_t weight_stride0, + const Tensor& weight_buf, + const Tensor& hx, + const c10::optional& cx_opt, + const Tensor& output, + const c10::optional& grad_output_r_opt, + const c10::optional& grad_hy_r_opt, + const c10::optional& grad_cy_r_opt, + int64_t mode, + int64_t hidden_size, + int64_t proj_size, + int64_t num_layers, + bool batch_first, + double dropout, + bool train, + bool bidirectional, + IntArrayRef batch_sizes, + const c10::optional& dropout_state_opt, + const Tensor& reserve, + std::array output_mask) { AT_ERROR("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); } -Tensor _cudnn_init_dropout_state(double dropout, bool train, int64_t dropout_seed, +Tensor _cudnn_init_dropout_state( + double dropout, + bool train, + int64_t dropout_seed, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] - TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); + TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory( + pin_memory); AT_ERROR("_cudnn_init_dropout_state: ATen not compiled with cuDNN support"); } -}} // namespace at::native +} // namespace native +} // namespace at #else // AT_CUDNN_ENABLED() #include -namespace at { namespace native { +namespace at { +namespace native { namespace { - // DropoutDescriptor - - struct DropoutDescriptorParams { - bool train; - double dropout; - Tensor dropout_state; - DropoutDescriptorParams() = default; - void set(bool train_, double dropout_, Tensor 
dropout_state_) { - train = train_; - dropout = dropout_; - dropout_state = dropout_state_; - } - DropoutDescriptor descriptor(cudnnHandle_t handle) const { - auto dropout_p = train ? dropout : 0; - DropoutDescriptor dropout_desc; - if (dropout_p == 0) { - dropout_desc.set_no_dropout(handle); - } else { - dropout_desc.set(handle, dropout_p, dropout_state); - } - return dropout_desc; +// DropoutDescriptor + +struct DropoutDescriptorParams { + bool train; + double dropout; + Tensor dropout_state; + DropoutDescriptorParams() = default; + void set(bool train_, double dropout_, Tensor dropout_state_) { + train = train_; + dropout = dropout_; + dropout_state = dropout_state_; + } + DropoutDescriptor descriptor(cudnnHandle_t handle) const { + auto dropout_p = train ? dropout : 0; + DropoutDescriptor dropout_desc; + if (dropout_p == 0) { + dropout_desc.set_no_dropout(handle); + } else { + dropout_desc.set(handle, dropout_p, dropout_state); } - }; + return dropout_desc; + } +}; - // RNNDescriptor +// RNNDescriptor - struct RNNDescriptorParams { +struct RNNDescriptorParams { #ifdef USE_CUDNN_RNN_V8_API - int64_t input_size; - bool packed; + int64_t input_size; + bool packed; #endif - int64_t hidden_size; - int64_t proj_size; - int64_t num_layers; - cudnnDirectionMode_t bidirectional; - cudnnRNNMode_t mode; - cudnnDataType_t datatype; - cudnnDataType_t input_datatype; - cudnnRNNAlgo_t algo = CUDNN_RNN_ALGO_STANDARD; - cudnnRNNInputMode_t input_mode = CUDNN_LINEAR_INPUT; - - int64_t num_directions() const { - return bidirectional ? 2 : 1; - } + int64_t hidden_size; + int64_t proj_size; + int64_t num_layers; + cudnnDirectionMode_t bidirectional; + cudnnRNNMode_t mode; + cudnnDataType_t datatype; + cudnnDataType_t input_datatype; + cudnnRNNAlgo_t algo = CUDNN_RNN_ALGO_STANDARD; + cudnnRNNInputMode_t input_mode = CUDNN_LINEAR_INPUT; + + int64_t num_directions() const { + return bidirectional ? 2 : 1; + } - void set_mode(int64_t fn_mode) { - switch (fn_mode) { - case CUDNN_RNN_RELU: - mode = CUDNN_RNN_RELU; - break; - case CUDNN_RNN_TANH: - mode = CUDNN_RNN_TANH; - break; - case CUDNN_LSTM: - mode = CUDNN_LSTM; - break; - case CUDNN_GRU: - mode = CUDNN_GRU; - break; - default: - { - std::ostringstream oss; - oss << "unrecognized cuDNN RNN mode " << fn_mode; - AT_ERROR(oss.str()); - } + void set_mode(int64_t fn_mode) { + switch (fn_mode) { + case CUDNN_RNN_RELU: + mode = CUDNN_RNN_RELU; + break; + case CUDNN_RNN_TANH: + mode = CUDNN_RNN_TANH; + break; + case CUDNN_LSTM: + mode = CUDNN_LSTM; + break; + case CUDNN_GRU: + mode = CUDNN_GRU; + break; + default: { + std::ostringstream oss; + oss << "unrecognized cuDNN RNN mode " << fn_mode; + AT_ERROR(oss.str()); } } + } - void set_bidirectional(bool fn_bidirectional) { - bidirectional = fn_bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; - } + void set_bidirectional(bool fn_bidirectional) { + bidirectional = + fn_bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; + } - void set_algo(cudnnRNNAlgo_t algo){ - this->algo = algo; - } + void set_algo(cudnnRNNAlgo_t algo) { + this->algo = algo; + } #ifndef USE_CUDNN_RNN_V8_API - void set(int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool bidirectional, cudnnDataType_t datatype, cudnnDataType_t input_datatype) { + void set( + int64_t mode, + int64_t hidden_size, + int64_t proj_size, + int64_t num_layers, + bool bidirectional, + cudnnDataType_t datatype, + cudnnDataType_t input_datatype){ #else - void set(int64_t mode, int64_t input_size, bool packed, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool bidirectional, cudnnDataType_t datatype, cudnnDataType_t input_datatype) { + void set( + int64_t mode, + int64_t input_size, + bool packed, + int64_t hidden_size, + int64_t proj_size, + int64_t num_layers, + bool bidirectional, + cudnnDataType_t datatype, + cudnnDataType_t input_datatype) { #endif this->set_mode(mode); #ifdef USE_CUDNN_RNN_V8_API - this->input_size = input_size; - this->packed = packed; + this->input_size = input_size; + this->packed = packed; #endif - this->hidden_size = hidden_size; - this->proj_size = proj_size; - this->num_layers = num_layers; - this->set_bidirectional(bidirectional); - this->datatype = datatype; - this->input_datatype = input_datatype; - } + this->hidden_size = hidden_size; + this->proj_size = proj_size; + this->num_layers = num_layers; + this->set_bidirectional(bidirectional); + this->datatype = datatype; + this->input_datatype = input_datatype; +} - RNNDescriptor descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const { - RNNDescriptor rnn_desc; +RNNDescriptor +descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const { + RNNDescriptor rnn_desc; #ifndef USE_CUDNN_RNN_V8_API - rnn_desc.set(handle, hidden_size, proj_size, num_layers, std::move(dropout_desc), input_mode, bidirectional, mode, datatype, input_datatype, algo, at::globalContext().allowTF32CuDNN()); + rnn_desc.set( + handle, + hidden_size, + proj_size, + num_layers, + std::move(dropout_desc), + input_mode, + bidirectional, + mode, + datatype, + input_datatype, + algo, + at::globalContext().allowTF32CuDNN()); #else - rnn_desc.set(handle, input_size, packed, hidden_size, proj_size, num_layers, std::move(dropout_desc), input_mode, bidirectional, mode, datatype, input_datatype, algo, at::globalContext().allowTF32CuDNN()); + rnn_desc.set( + handle, + input_size, + packed, + hidden_size, + proj_size, + num_layers, + std::move(dropout_desc), + input_mode, + bidirectional, + mode, + datatype, + input_datatype, + algo, + at::globalContext().allowTF32CuDNN()); #endif - return rnn_desc; - } + return rnn_desc; +} - // In some cases, a use of RNNDescriptor does not rely on the - // DropoutDescriptor. In this case, we fake up a no-dropout - // descriptor to make the RNN descriptor initialization go through. - // This is used by _cudnn_rnn_flatten_weight, which needs an - // RNNDescriptor for get_parameters(), but does not actually need - // a fully initialized dropout descriptor. This lets us avoid - // having to pass the dropout state to flatten, which has no business - // knowing what the dropout state is. - RNNDescriptor descriptor(cudnnHandle_t handle) const { - DropoutDescriptor dropout_desc; - dropout_desc.set_no_dropout(handle); - return descriptor(handle, std::move(dropout_desc)); - } - }; +// In some cases, a use of RNNDescriptor does not rely on the +// DropoutDescriptor. 
In this case, we fake up a no-dropout +// descriptor to make the RNN descriptor initialization go through. +// This is used by _cudnn_rnn_flatten_weight, which needs an +// RNNDescriptor for get_parameters(), but does not actually need +// a fully initialized dropout descriptor. This lets us avoid +// having to pass the dropout state to flatten, which has no business +// knowing what the dropout state is. +RNNDescriptor descriptor(cudnnHandle_t handle) const { + DropoutDescriptor dropout_desc; + dropout_desc.set_no_dropout(handle); + return descriptor(handle, std::move(dropout_desc)); +} +}; // namespace - // TensorDescriptor list +// TensorDescriptor list #ifndef USE_CUDNN_RNN_V8_API - std::vector rnn_descriptor_sequence(const Tensor& tensor, IntArrayRef batch_sizes) { - std::vector descriptors(batch_sizes.size()); - size_t i = 0; - // To be mutated in the loop - auto batch_tensor_size = tensor.sizes().vec(); - for (auto batch_size : batch_sizes) { - batch_tensor_size[0] = batch_size; - // NB: cuDNN RNN API does not support 2d descriptors, so we - // must pad it out to 3d. - descriptors[i].set(getCudnnDataType(tensor), batch_tensor_size, tensor.strides(), 3); - i++; - } - return descriptors; +std::vector rnn_descriptor_sequence( + const Tensor& tensor, + IntArrayRef batch_sizes) { + std::vector descriptors(batch_sizes.size()); + size_t i = 0; + // To be mutated in the loop + auto batch_tensor_size = tensor.sizes().vec(); + for (auto batch_size : batch_sizes) { + batch_tensor_size[0] = batch_size; + // NB: cuDNN RNN API does not support 2d descriptors, so we + // must pad it out to 3d. + descriptors[i].set( + getCudnnDataType(tensor), batch_tensor_size, tensor.strides(), 3); + i++; } + return descriptors; +} - std::vector rnn_descriptor(const Tensor& tensor, int64_t N) { - std::vector descriptors(N); - for (const auto i : c10::irange(N)) { - descriptors[i].set(tensor, 5); - } - return descriptors; +std::vector rnn_descriptor(const Tensor& tensor, int64_t N) { + std::vector descriptors(N); + for (const auto i : c10::irange(N)) { + descriptors[i].set(tensor, 5); } + return descriptors; +} #else - auto rnn_descriptor_sequence(const Tensor& tensor, uint32_t batch_size, const IntArrayRef batch_sizes, uint32_t seq_len, uint32_t vector_size) { // packed case - RNNDataDescriptor r; - std::vector seqLengthArray(batch_size, 1); - // cuDNN wants the sequence lenghts for a packed batch as if they - // were unpacked, e.g., for the - // Sequence 1: ABCD - // Sequence 2: EF - // Sequence 3: G - // case below, this would be [4, 2, 1] (has length == mini_batch) - // TODO(eqy): There's probably a smarter way to do this than O(SN) - for (auto it = batch_sizes.begin(); it != batch_sizes.end(); it++) { - // everyone starts at sequence length 1 so we skip an iteration - if (it == batch_sizes.begin()) { - continue; - } - for (const auto idx : c10::irange(*it)) { - seqLengthArray[idx]++; - } +auto rnn_descriptor_sequence( + const Tensor& tensor, + uint32_t batch_size, + const IntArrayRef batch_sizes, + uint32_t seq_len, + uint32_t vector_size) { // packed case + RNNDataDescriptor r; + std::vector seqLengthArray(batch_size, 1); + // cuDNN wants the sequence lengths for a packed batch as if they + // were unpacked, e.g., for the + // Sequence 1: ABCD + // Sequence 2: EF + // Sequence 3: G + // case below, this would be [4, 2, 1] (has length == mini_batch) + // TODO(eqy): There's probably a smarter way to do this than O(SN) + for (auto it = batch_sizes.begin(); it != batch_sizes.end(); it++) { + // everyone starts 
at sequence length 1 so we skip an iteration + if (it == batch_sizes.begin()) { + continue; + } + for (const auto idx : c10::irange(*it)) { + seqLengthArray[idx]++; } - r.set(tensor, CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, seq_len, batch_size, vector_size, seqLengthArray.data()); - return r; } + r.set( + tensor, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + seq_len, + batch_size, + vector_size, + seqLengthArray.data()); + return r; +} - auto rnn_descriptor(const Tensor& tensor, uint32_t batch_size, uint32_t seq_len, uint32_t vector_size) { - RNNDataDescriptor r; - // NB: Looks like even if batch_first is true here we always want SEQ_MAJOR_UNPACKED, because the input - // appears to be transposed if it is barch-major - const auto layout = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED; - std::vector seqLengthArray(batch_size, seq_len); - r.set(tensor, layout, seq_len, batch_size, vector_size, seqLengthArray.data()); - return r; - } +auto rnn_descriptor( + const Tensor& tensor, + uint32_t batch_size, + uint32_t seq_len, + uint32_t vector_size) { + RNNDataDescriptor r; + // NB: Looks like even if batch_first is true here we always want + // SEQ_MAJOR_UNPACKED, because the input appears to be transposed if it is + // barch-major + const auto layout = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED; + std::vector seqLengthArray(batch_size, seq_len); + r.set( + tensor, layout, seq_len, batch_size, vector_size, seqLengthArray.data()); + return r; +} #endif - // The best way to understand the meaning of the values stored in - // this struct is to consider each of the possible ways our - // input can be structured. - // - // Suppose you want to run RNN on the following variable - // length inputs: - // - // Sequence 1: ABCD - // Sequence 2: EF - // Sequence 3: G - // - // (Let _ be padding when we have non-packed representations.) - // - // # Packed input (batch_sizes is non-empty) - // - // input_size - // +------+ + - // | A | | - // | E | mini_batch = | - // | G | batch_sizes[0] = 3 | - // +------+ | - // | B | | batch_sizes_sum = 7 - // | F | batch_sizes[1] = 2 | - // +------+ | - // | C | batch_sizes[2] = 1 | - // +------+ | - // | D | batch_sizes[3] = 1 | - // +------+ + - // - // (seq_length = 4) - // - // input.size() = batch_sizes_sum x input_size - // - // # Unpacked input (batch_first = false) - // - // mini_batch = 3 - // +-------+ - // | A E G | - // | B F _ | seq_length = 4 - // | C _ _ | - // | D _ _ | - // +-------+ - // ... input_size - // +-------+ - // - // input.size() = seq_length x mini_batch x input_size - // - // # Unpacked input (batch_first = true) - // - // seq_length = 4 - // +---------+ - // | A B C D | - // | E F _ _ | mini_batch = 3 - // | G _ _ _ | - // +---------+ - // ... input_size - // +---------+ - // - // input.size() = mini_batch x seq_length x input_size - // - struct TensorDescriptorListParams { - IntArrayRef batch_sizes; - int64_t seq_length; - int64_t mini_batch; - // NB: this is not input.size(), which is an IntArrayRef; instead, this - // size of the inner-most dimension. In NL applications, this is usually - // the size of the embedding. 
You can also think of this as the size - // of the "channel" dimension (at risk of confusing vision researchers :) - int64_t input_size; - // Only valid when !is_input_packed - int64_t batch_sizes_sum; // == sum(batch_sizes) - - bool is_input_packed() const { - return batch_sizes.size() != 0; - } +// The best way to understand the meaning of the values stored in +// this struct is to consider each of the possible ways our +// input can be structured. +// +// Suppose you want to run RNN on the following variable +// length inputs: +// +// Sequence 1: ABCD +// Sequence 2: EF +// Sequence 3: G +// +// (Let _ be padding when we have non-packed representations.) +// +// # Packed input (batch_sizes is non-empty) +// +// input_size +// +------+ + +// | A | | +// | E | mini_batch = | +// | G | batch_sizes[0] = 3 | +// +------+ | +// | B | | batch_sizes_sum = 7 +// | F | batch_sizes[1] = 2 | +// +------+ | +// | C | batch_sizes[2] = 1 | +// +------+ | +// | D | batch_sizes[3] = 1 | +// +------+ + +// +// (seq_length = 4) +// +// input.size() = batch_sizes_sum x input_size +// +// # Unpacked input (batch_first = false) +// +// mini_batch = 3 +// +-------+ +// | A E G | +// | B F _ | seq_length = 4 +// | C _ _ | +// | D _ _ | +// +-------+ +// ... input_size +// +-------+ +// +// input.size() = seq_length x mini_batch x input_size +// +// # Unpacked input (batch_first = true) +// +// seq_length = 4 +// +---------+ +// | A B C D | +// | E F _ _ | mini_batch = 3 +// | G _ _ _ | +// +---------+ +// ... input_size +// +---------+ +// +// input.size() = mini_batch x seq_length x input_size +// +struct TensorDescriptorListParams { + IntArrayRef batch_sizes; + int64_t seq_length; + int64_t mini_batch; + // NB: this is not input.size(), which is an IntArrayRef; instead, this + // size of the inner-most dimension. In NL applications, this is usually + // the size of the embedding. You can also think of this as the size + // of the "channel" dimension (at risk of confusing vision researchers :) + int64_t input_size; + // Only valid when !is_input_packed + int64_t batch_sizes_sum; // == sum(batch_sizes) + + bool is_input_packed() const { + return batch_sizes.size() != 0; + } - void set(IntArrayRef input_sizes, IntArrayRef batch_sizes_, bool batch_first) { - batch_sizes = batch_sizes_; - if (is_input_packed()) { - seq_length = batch_sizes.size(); - mini_batch = batch_sizes[0]; - // NB: When input is packed, the mini_batch size is NOT the size - // of the outer dimension - batch_sizes_sum = input_sizes[0]; - input_size = input_sizes[1]; + void set( + IntArrayRef input_sizes, + IntArrayRef batch_sizes_, + bool batch_first) { + batch_sizes = batch_sizes_; + if (is_input_packed()) { + seq_length = batch_sizes.size(); + mini_batch = batch_sizes[0]; + // NB: When input is packed, the mini_batch size is NOT the size + // of the outer dimension + batch_sizes_sum = input_sizes[0]; + input_size = input_sizes[1]; + } else { + if (batch_first) { + seq_length = input_sizes[1]; + mini_batch = input_sizes[0]; } else { - if (batch_first) { - seq_length = input_sizes[1]; - mini_batch = input_sizes[0]; - } else { - seq_length = input_sizes[0]; - mini_batch = input_sizes[1]; - } - input_size = input_sizes[2]; - // TODO: Actually, would this make ASAN's job harder catching - // an uninitialized access? 
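// A standalone sketch (illustrative only, not part of this diff) of the packed-input
// bookkeeping described in the layout comment above, worked through for the example
// sequences ABCD / EF / G. It also derives the per-sequence lengths the same way
// rnn_descriptor_sequence does for cuDNN's packed RNNDataDescriptor. Only the C++
// standard library is used; the variable names mirror TensorDescriptorListParams
// purely for readability.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // batch_sizes[t] = number of sequences still active at time step t.
  std::vector<int64_t> batch_sizes = {3, 2, 1, 1};

  int64_t seq_length = static_cast<int64_t>(batch_sizes.size()); // 4
  int64_t mini_batch = batch_sizes[0];                           // 3
  int64_t batch_sizes_sum = 0;                                   // 7 rows in the packed input
  for (int64_t b : batch_sizes) {
    batch_sizes_sum += b;
  }

  // Unpack the step-wise batch sizes into per-sequence lengths ([4, 2, 1]),
  // i.e. the seqLengthArray that the packed descriptor path expects.
  std::vector<int32_t> seqLengthArray(mini_batch, 1); // every sequence has length >= 1
  for (size_t t = 1; t < batch_sizes.size(); ++t) {
    for (int64_t i = 0; i < batch_sizes[t]; ++i) {
      seqLengthArray[i]++;
    }
  }

  std::printf("seq_length=%lld mini_batch=%lld batch_sizes_sum=%lld\n",
              (long long)seq_length, (long long)mini_batch,
              (long long)batch_sizes_sum);
  for (int32_t len : seqLengthArray) {
    std::printf("sequence length: %d\n", len);
  }
  return 0;
}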
- batch_sizes_sum = -1; // something bogus in case we access it + seq_length = input_sizes[0]; + mini_batch = input_sizes[1]; } + input_size = input_sizes[2]; + // TODO: Actually, would this make ASAN's job harder catching + // an uninitialized access? + batch_sizes_sum = -1; // something bogus in case we access it } + } #ifndef USE_CUDNN_RNN_V8_API - // TODO: check x for consistency with input_size? - std::vector descriptors(Tensor x) const { - auto is_input_packed = batch_sizes.size() != 0; - if (is_input_packed) { - return rnn_descriptor_sequence(x, batch_sizes); - } else { - return rnn_descriptor(x[0], seq_length); - } + // TODO: check x for consistency with input_size? + std::vector descriptors(Tensor x) const { + auto is_input_packed = batch_sizes.size() != 0; + if (is_input_packed) { + return rnn_descriptor_sequence(x, batch_sizes); + } else { + return rnn_descriptor(x[0], seq_length); } + } #else - auto descriptors(Tensor x) const { - auto is_input_packed = batch_sizes.size() != 0; - if (is_input_packed) { - return rnn_descriptor_sequence(x, mini_batch, batch_sizes, seq_length, x.size(-1)); - } else { - return rnn_descriptor(x, mini_batch, seq_length, x.size(-1)); - } + auto descriptors(Tensor x) const { + auto is_input_packed = batch_sizes.size() != 0; + if (is_input_packed) { + return rnn_descriptor_sequence( + x, mini_batch, batch_sizes, seq_length, x.size(-1)); + } else { + return rnn_descriptor(x, mini_batch, seq_length, x.size(-1)); } + } #endif - }; +}; - // Everything together +// Everything together - struct RNNParams { - DropoutDescriptorParams dropout; - RNNDescriptorParams rnn; - TensorDescriptorListParams tensors; - }; +struct RNNParams { + DropoutDescriptorParams dropout; + RNNDescriptorParams rnn; + TensorDescriptorListParams tensors; +}; - // NB: Doesn't include the weight descriptor - struct RNNDescriptors { - RNNDescriptor rnn_desc; - // NB: this won't actually lay out the tensor descriptor pointers - // in the right way, so you'll have to preprocess them +// NB: Doesn't include the weight descriptor +struct RNNDescriptors { + RNNDescriptor rnn_desc; + // NB: this won't actually lay out the tensor descriptor pointers + // in the right way, so you'll have to preprocess them #ifndef USE_CUDNN_RNN_V8_API - std::vector x_descs; - std::vector y_descs; + std::vector x_descs; + std::vector y_descs; #else - RNNDataDescriptor x_descs; - RNNDataDescriptor y_descs; + RNNDataDescriptor x_descs; + RNNDataDescriptor y_descs; #endif - TensorDescriptor hx_desc; - TensorDescriptor hy_desc; - TensorDescriptor cx_desc; - TensorDescriptor cy_desc; - - RNNDescriptors(const RNNParams& fn, cudnnHandle_t handle, Tensor x, Tensor y, Tensor hx, Tensor cx) { - rnn_desc = fn.rnn.descriptor(handle, fn.dropout.descriptor(handle)); - x_descs = fn.tensors.descriptors(x); - y_descs = fn.tensors.descriptors(y); - hx_desc.set(hx, 5); - hy_desc.set(hx, 5); - if (cx.defined()) { - cx_desc.set(cx, 5); - cy_desc.set(cx, 5); - } + TensorDescriptor hx_desc; + TensorDescriptor hy_desc; + TensorDescriptor cx_desc; + TensorDescriptor cy_desc; + + RNNDescriptors( + const RNNParams& fn, + cudnnHandle_t handle, + Tensor x, + Tensor y, + Tensor hx, + Tensor cx) { + rnn_desc = fn.rnn.descriptor(handle, fn.dropout.descriptor(handle)); + x_descs = fn.tensors.descriptors(x); + y_descs = fn.tensors.descriptors(y); + hx_desc.set(hx, 5); + hy_desc.set(hx, 5); + if (cx.defined()) { + cx_desc.set(cx, 5); + cy_desc.set(cx, 5); } + } - // TODO: This is annoying, having to put the cudnnTensorDescriptor_t - // in a 
contiguous array... - std::vector get_descs(const std::vector& descs) { - std::vector r; - r.reserve(descs.size()); - for (auto& desc : descs) { - r.emplace_back(desc.desc()); - } - return r; + // TODO: This is annoying, having to put the cudnnTensorDescriptor_t + // in a contiguous array... + std::vector get_descs( + const std::vector& descs) { + std::vector r; + r.reserve(descs.size()); + for (auto& desc : descs) { + r.emplace_back(desc.desc()); } + return r; + } #ifndef USE_CUDNN_RNN_V8_API - std::vector get_x_descs() { - return get_descs(x_descs); - } + std::vector get_x_descs() { + return get_descs(x_descs); + } - std::vector get_y_descs() { - return get_descs(y_descs); - } + std::vector get_y_descs() { + return get_descs(y_descs); + } #endif - }; +}; - int64_t get_num_weights(cudnnHandle_t handle, const RNNDescriptor& rnn_desc, +int64_t get_num_weights( + cudnnHandle_t handle, + const RNNDescriptor& rnn_desc, #ifndef USE_CUDNN_RNN_V8_API - const TensorDescriptor& x_desc, + const TensorDescriptor& x_desc, #endif - cudnnDataType_t datatype) { - size_t weight_size; + cudnnDataType_t datatype) { + size_t weight_size; #ifndef USE_CUDNN_RNN_V8_API - AT_CUDNN_CHECK(cudnnGetRNNParamsSize(handle, rnn_desc.desc(), x_desc.desc(), &weight_size, datatype)); + AT_CUDNN_CHECK(cudnnGetRNNParamsSize( + handle, rnn_desc.desc(), x_desc.desc(), &weight_size, datatype)); #else - AT_CUDNN_CHECK(cudnnGetRNNWeightSpaceSize(handle, rnn_desc.desc(), &weight_size)); + AT_CUDNN_CHECK( + cudnnGetRNNWeightSpaceSize(handle, rnn_desc.desc(), &weight_size)); #endif - auto elem_size = dataSize(datatype); - TORCH_INTERNAL_ASSERT(weight_size % elem_size == 0, "cudnnGetRNNParamsSize returned nonsensical weight_size"); - return weight_size / elem_size; - } + auto elem_size = dataSize(datatype); + TORCH_INTERNAL_ASSERT( + weight_size % elem_size == 0, + "cudnnGetRNNParamsSize returned nonsensical weight_size"); + return weight_size / elem_size; +} - int64_t _num_linear_layers(cudnnRNNMode_t mode) { - switch(mode) { - case CUDNN_LSTM: - return 8; - case CUDNN_GRU: - return 6; - case CUDNN_RNN_RELU: - return 2; - case CUDNN_RNN_TANH: - return 2; - default: - AT_ERROR("unknown cuDNN RNN mode ", mode); - } +int64_t _num_linear_layers(cudnnRNNMode_t mode) { + switch (mode) { + case CUDNN_LSTM: + return 8; + case CUDNN_GRU: + return 6; + case CUDNN_RNN_RELU: + return 2; + case CUDNN_RNN_TANH: + return 2; + default: + AT_ERROR("unknown cuDNN RNN mode ", mode); } +} - void add_projection_weights( - cudnnHandle_t handle, - const RNNDescriptor& rnn_desc, +void add_projection_weights( + cudnnHandle_t handle, + const RNNDescriptor& rnn_desc, #ifndef USE_CUDNN_RNN_V8_API - const TensorDescriptor& x_desc, - const FilterDescriptor& w_desc, + const TensorDescriptor& x_desc, + const FilterDescriptor& w_desc, #endif - const Tensor& weight_buf, - int64_t layer, - std::vector& params - ) { - void* matrix_pointer = nullptr; - // assuming it's LSTM which has 8 "linear layers" (i.e. 4 weights and 4 biases) - int64_t linear_id = 8; + const Tensor& weight_buf, + int64_t layer, + std::vector& params) { + void* matrix_pointer = nullptr; + // assuming it's LSTM which has 8 "linear layers" (i.e. 
4 weights and 4 + // biases) + int64_t linear_id = 8; #ifndef USE_CUDNN_RNN_V8_API - FilterDescriptor lin_layer_mat_desc; - AT_CUDNN_CHECK(cudnnGetRNNLinLayerMatrixParams( - /*handle=*/handle, - /*rnnDesc=*/rnn_desc.desc(), - /*layer=*/layer, - /*xDesc=*/x_desc.desc(), - /*wDesc=*/w_desc.desc(), - /*w=*/weight_buf.data_ptr(), - /*linLayerID=*/linear_id, - /*linLayerMatDesc=*/lin_layer_mat_desc.mut_desc(), - /*linLayerMat=*/&matrix_pointer)); + FilterDescriptor lin_layer_mat_desc; + AT_CUDNN_CHECK(cudnnGetRNNLinLayerMatrixParams( + /*handle=*/handle, + /*rnnDesc=*/rnn_desc.desc(), + /*layer=*/layer, + /*xDesc=*/x_desc.desc(), + /*wDesc=*/w_desc.desc(), + /*w=*/weight_buf.data_ptr(), + /*linLayerID=*/linear_id, + /*linLayerMatDesc=*/lin_layer_mat_desc.mut_desc(), + /*linLayerMat=*/&matrix_pointer)); #else - void *unused_pointer; - TensorDescriptor unused_desc; - TensorDescriptor lin_layer_mat_desc; - AT_CUDNN_CHECK(cudnnGetRNNWeightParams( - /*handle=*/handle, - /*rnnDesc=*/rnn_desc.desc(), - /*layer=*/layer, - /*wDesc=*/weight_buf.numel() * weight_buf.element_size(), - /*w=*/weight_buf.data_ptr(), - /*linLayerID=*/linear_id, - /*linLayerMatDesc=*/lin_layer_mat_desc.mut_desc(), - /*linLayerMat=*/&matrix_pointer, unused_desc.mut_desc(), &unused_pointer)); + void* unused_pointer; + TensorDescriptor unused_desc; + TensorDescriptor lin_layer_mat_desc; + AT_CUDNN_CHECK(cudnnGetRNNWeightParams( + /*handle=*/handle, + /*rnnDesc=*/rnn_desc.desc(), + /*layer=*/layer, + /*wDesc=*/weight_buf.numel() * weight_buf.element_size(), + /*w=*/weight_buf.data_ptr(), + /*linLayerID=*/linear_id, + /*linLayerMatDesc=*/lin_layer_mat_desc.mut_desc(), + /*linLayerMat=*/&matrix_pointer, + unused_desc.mut_desc(), + &unused_pointer)); #endif - cudnnDataType_t data_type; + cudnnDataType_t data_type; #ifndef USE_CUDNN_RNN_V8_API - cudnnTensorFormat_t format; + cudnnTensorFormat_t format; #else - int stride_dim_a[5]; + int stride_dim_a[5]; #endif - int nb_dims; - constexpr int min_dim = 3; - int filter_dim_a[min_dim]; + int nb_dims; + constexpr int min_dim = 3; + int filter_dim_a[min_dim]; #ifndef USE_CUDNN_RNN_V8_API - AT_CUDNN_CHECK( - cudnnGetFilterNdDescriptor( - lin_layer_mat_desc.desc(), - min_dim, - &data_type, - &format, - &nb_dims, - filter_dim_a - )); + AT_CUDNN_CHECK(cudnnGetFilterNdDescriptor( + lin_layer_mat_desc.desc(), + min_dim, + &data_type, + &format, + &nb_dims, + filter_dim_a)); #else - AT_CUDNN_CHECK( - cudnnGetTensorNdDescriptor( - lin_layer_mat_desc.desc(), - min_dim, - &data_type, - &nb_dims, - filter_dim_a, - stride_dim_a - )); + AT_CUDNN_CHECK(cudnnGetTensorNdDescriptor( + lin_layer_mat_desc.desc(), + min_dim, + &data_type, + &nb_dims, + filter_dim_a, + stride_dim_a)); #endif - TORCH_INTERNAL_ASSERT(nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim = ", min_dim); - auto elem_size = dataSize(getCudnnDataType(weight_buf)); - auto offset_bytes = (char*)matrix_pointer - (char*)weight_buf.data_ptr(); - TORCH_INTERNAL_ASSERT(offset_bytes % elem_size == 0, "offset_bytes = ", offset_bytes, "; elem_size = ", elem_size); - size_t offset = offset_bytes / elem_size; - - int mat_numel = c10::multiply_integers(filter_dim_a, filter_dim_a + nb_dims); - // Generate a new parameter tensor which is a view into the weight_buf. 
- std::initializer_list size = {mat_numel, 1}; - Tensor param = at::empty({0}, weight_buf.options()).set_(weight_buf.storage(), offset, size); - params.emplace_back(std::move(param)); - } - + TORCH_INTERNAL_ASSERT( + nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim = ", min_dim); + auto elem_size = dataSize(getCudnnDataType(weight_buf)); + auto offset_bytes = (char*)matrix_pointer - (char*)weight_buf.data_ptr(); + TORCH_INTERNAL_ASSERT( + offset_bytes % elem_size == 0, + "offset_bytes = ", + offset_bytes, + "; elem_size = ", + elem_size); + size_t offset = offset_bytes / elem_size; + + int mat_numel = c10::multiply_integers(filter_dim_a, filter_dim_a + nb_dims); + // Generate a new parameter tensor which is a view into the weight_buf. + std::initializer_list size = {mat_numel, 1}; + Tensor param = at::empty({0}, weight_buf.options()) + .set_(weight_buf.storage(), offset, size); + params.emplace_back(std::move(param)); +} - /* - Returns weight and bias tensors for each layer of the RNN. These tensors - are views on the underlying weight buffer allocated by CuDNN. - - Note: for LSTM and GRU, which have multiple parameters of each type (4 and 3, respectively), - these parameters are concatenated along the first dimension. - These parameters are returned in a consistent order by CuDNN: - (reset, forget, cell, output) for LSTM - (reset, input, new) for GRU - Args: - fn: The RNN function object holding the RNN state - handle: a CuDNN handle - weight_buf: a 1D tensor containing the CuDNN-allocated weight (or grad_weight) buffer - Returns: - parameters: [(weight_ih, weight_hh, bias_ih, bias_hh)*], with length equal to the num_layers. - This is represented as a pair of vector, and outer-dimension stride - (NB: Can't return MatrixRef because we need to allocate the underlying tensor) - */ - std::pair, size_t> // stride0 - get_parameters( - cudnnHandle_t handle, - const RNNDescriptorParams& rnn, - const RNNDescriptor& rnn_desc, +/* + Returns weight and bias tensors for each layer of the RNN. These tensors + are views on the underlying weight buffer allocated by CuDNN. + + Note: for LSTM and GRU, which have multiple parameters of each type (4 and 3, + respectively), these parameters are concatenated along the first dimension. + These parameters are returned in a consistent order by CuDNN: + (reset, forget, cell, output) for LSTM + (reset, input, new) for GRU + Args: + fn: The RNN function object holding the RNN state + handle: a CuDNN handle + weight_buf: a 1D tensor containing the CuDNN-allocated weight (or + grad_weight) buffer Returns: parameters: [(weight_ih, weight_hh, bias_ih, + bias_hh)*], with length equal to the num_layers. 
This is represented as a pair + of vector, and outer-dimension stride (NB: Can't return MatrixRef because we + need to allocate the underlying tensor) +*/ +std::pair, size_t> // stride0 +get_parameters( + cudnnHandle_t handle, + const RNNDescriptorParams& rnn, + const RNNDescriptor& rnn_desc, #ifndef USE_CUDNN_RNN_V8_API - const TensorDescriptor& x_desc, - const FilterDescriptor& w_desc, + const TensorDescriptor& x_desc, + const FilterDescriptor& w_desc, #endif - const Tensor& weight_buf, - bool include_bias=true - ) { + const Tensor& weight_buf, + bool include_bias = true) { #ifndef USE_CUDNN_RNN_V8_API - auto cudnn_methods = { cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams }; + auto cudnn_methods = { + cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams}; #else - auto cudnn_methods = { true, false }; + auto cudnn_methods = {true, false}; #endif - std::vector params; - int64_t num_linear_layers = _num_linear_layers(rnn.mode); - int64_t num_layers = rnn.num_directions() * rnn.num_layers; - size_t cur_offset = 0; - size_t global_layer_params_count = 0; - for (const auto layer : c10::irange(num_layers)) { - size_t layer_params_count = 0; - for (auto cudnn_method : cudnn_methods) { - for (const auto linear_id : c10::irange(num_linear_layers)) { - void* matrix_pointer; + std::vector params; + int64_t num_linear_layers = _num_linear_layers(rnn.mode); + int64_t num_layers = rnn.num_directions() * rnn.num_layers; + size_t cur_offset = 0; + size_t global_layer_params_count = 0; + for (const auto layer : c10::irange(num_layers)) { + size_t layer_params_count = 0; + for (auto cudnn_method : cudnn_methods) { + for (const auto linear_id : c10::irange(num_linear_layers)) { + void* matrix_pointer; #ifndef USE_CUDNN_RNN_V8_API - FilterDescriptor lin_layer_mat_desc; - AT_CUDNN_CHECK(cudnn_method( + FilterDescriptor lin_layer_mat_desc; + AT_CUDNN_CHECK(cudnn_method( + handle, + rnn_desc.desc(), + layer, + x_desc.desc(), + w_desc.desc(), + weight_buf.data_ptr(), + linear_id, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer)); +#else + void* unused_pointer = nullptr; + TensorDescriptor unused_desc; + TensorDescriptor lin_layer_mat_desc; + for (int stateless = 0; stateless < 100; stateless++) { + if (cudnn_method) { // matrix + AT_CUDNN_CHECK(cudnnGetRNNWeightParams( handle, rnn_desc.desc(), layer, - x_desc.desc(), - w_desc.desc(), + weight_buf.numel() * weight_buf.element_size(), weight_buf.data_ptr(), linear_id, lin_layer_mat_desc.mut_desc(), - &matrix_pointer - )); -#else - void *unused_pointer = nullptr; - TensorDescriptor unused_desc; - TensorDescriptor lin_layer_mat_desc; - for (int stateless = 0; stateless < 100; stateless++) { - if (cudnn_method) { // matrix - AT_CUDNN_CHECK(cudnnGetRNNWeightParams( - handle, - rnn_desc.desc(), - layer, - weight_buf.numel() * weight_buf.element_size(), - weight_buf.data_ptr(), - linear_id, - lin_layer_mat_desc.mut_desc(), - &matrix_pointer, - unused_desc.mut_desc(), - &unused_pointer - )); + &matrix_pointer, + unused_desc.mut_desc(), + &unused_pointer)); } else { // bias - AT_CUDNN_CHECK(cudnnGetRNNWeightParams( - handle, - rnn_desc.desc(), - layer, - weight_buf.numel() * weight_buf.element_size(), - weight_buf.data_ptr(), - linear_id, - unused_desc.mut_desc(), - &unused_pointer, - lin_layer_mat_desc.mut_desc(), - &matrix_pointer - )); - } + AT_CUDNN_CHECK(cudnnGetRNNWeightParams( + handle, + rnn_desc.desc(), + layer, + weight_buf.numel() * weight_buf.element_size(), + weight_buf.data_ptr(), + linear_id, + unused_desc.mut_desc(), + 
&unused_pointer, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer)); } + } #endif - cudnnDataType_t data_type; + cudnnDataType_t data_type; #ifndef USE_CUDNN_RNN_V8_API - cudnnTensorFormat_t format; + cudnnTensorFormat_t format; #else - int stride_dim_a[5]; + int stride_dim_a[5]; #endif - int nb_dims; - constexpr int min_dim = 3; - int filter_dim_a[min_dim]; + int nb_dims; + constexpr int min_dim = 3; + int filter_dim_a[min_dim]; #ifndef USE_CUDNN_RNN_V8_API - AT_CUDNN_CHECK( - cudnnGetFilterNdDescriptor( - lin_layer_mat_desc.desc(), - min_dim, - &data_type, - &format, - &nb_dims, - filter_dim_a - )); + AT_CUDNN_CHECK(cudnnGetFilterNdDescriptor( + lin_layer_mat_desc.desc(), + min_dim, + &data_type, + &format, + &nb_dims, + filter_dim_a)); #else - AT_CUDNN_CHECK( - cudnnGetTensorNdDescriptor( - lin_layer_mat_desc.desc(), - min_dim, - &data_type, - &nb_dims, - filter_dim_a, - stride_dim_a - )); + AT_CUDNN_CHECK(cudnnGetTensorNdDescriptor( + lin_layer_mat_desc.desc(), + min_dim, + &data_type, + &nb_dims, + filter_dim_a, + stride_dim_a)); #endif - TORCH_INTERNAL_ASSERT(nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim = ", min_dim); - auto elem_size = dataSize(getCudnnDataType(weight_buf)); - auto offset_bytes = (char*)matrix_pointer - (char*)weight_buf.data_ptr(); - TORCH_INTERNAL_ASSERT(offset_bytes % elem_size == 0, "offset_bytes = ", offset_bytes, "; elem_size = ", elem_size); - size_t offset = offset_bytes / elem_size; - // for all the RNN types provided by CUDNN, all the ih weights - // are the same size and are allocated in a contiguous chunk - // (same for the hh weights, and the ih and hh biases). - // Since we're storing all the weights in a single tensor anyway, - // might as well merge the CUDNN ones into a single tensor as well - int mat_numel = c10::multiply_integers(filter_dim_a, filter_dim_a + nb_dims); - if (linear_id == 0 || linear_id == num_linear_layers / 2) { - // We could also exclude bias params by restricting cudnn_methods to just { cudnnGetRNNLinLayerMatrixParams } - // at the very top. However, to do so would throw off the cur_offset account, which is currently a strict - // and informative check that all params are laid out the way we think they are. If include_bias is false, - // I'd rather keep full cur_offset checks rather than save some CPU overhead by skipping the cudnn_method = - // cudnnGetRNNLinLayerBiasParams iteration. + TORCH_INTERNAL_ASSERT( + nb_dims <= min_dim, + "nb_dims = ", + nb_dims, + "; min_dim = ", + min_dim); + auto elem_size = dataSize(getCudnnDataType(weight_buf)); + auto offset_bytes = + (char*)matrix_pointer - (char*)weight_buf.data_ptr(); + TORCH_INTERNAL_ASSERT( + offset_bytes % elem_size == 0, + "offset_bytes = ", + offset_bytes, + "; elem_size = ", + elem_size); + size_t offset = offset_bytes / elem_size; + // for all the RNN types provided by CUDNN, all the ih weights + // are the same size and are allocated in a contiguous chunk + // (same for the hh weights, and the ih and hh biases). + // Since we're storing all the weights in a single tensor anyway, + // might as well merge the CUDNN ones into a single tensor as well + int mat_numel = + c10::multiply_integers(filter_dim_a, filter_dim_a + nb_dims); + if (linear_id == 0 || linear_id == num_linear_layers / 2) { + // We could also exclude bias params by restricting cudnn_methods to + // just { cudnnGetRNNLinLayerMatrixParams } at the very top. 
However, + // to do so would throw off the cur_offset account, which is currently + // a strict and informative check that all params are laid out the way + // we think they are. If include_bias is false, I'd rather keep full + // cur_offset checks rather than save some CPU overhead by skipping + // the cudnn_method = cudnnGetRNNLinLayerBiasParams iteration. #ifndef USE_CUDNN_RNN_V8_API - if (include_bias || cudnn_method != cudnnGetRNNLinLayerBiasParams) { + if (include_bias || cudnn_method != cudnnGetRNNLinLayerBiasParams) { #else - if (include_bias || cudnn_method) { + if (include_bias || cudnn_method) { #endif - // Generate a new parameter tensor which is a view into the weight_buf. - std::initializer_list size = { + // Generate a new parameter tensor which is a view into the + // weight_buf. + std::initializer_list size = { mat_numel * num_linear_layers / 2, 1}; - Tensor param = at::empty({0}, weight_buf.options()).set_(weight_buf.storage(), offset, size); - params.emplace_back(std::move(param)); - layer_params_count++; - } - } else { - TORCH_INTERNAL_ASSERT(cur_offset == offset, "cur_offset = ", cur_offset, "; offset = ", offset); + Tensor param = at::empty({0}, weight_buf.options()) + .set_(weight_buf.storage(), offset, size); + params.emplace_back(std::move(param)); + layer_params_count++; } - cur_offset = offset + mat_numel; + } else { + TORCH_INTERNAL_ASSERT( + cur_offset == offset, + "cur_offset = ", + cur_offset, + "; offset = ", + offset); } - } // for cudnn_method - if (rnn.proj_size != 0) { + cur_offset = offset + mat_numel; + } + } // for cudnn_method + if (rnn.proj_size != 0) { #ifndef USE_CUDNN_RNN_V8_API - add_projection_weights(handle, rnn_desc, x_desc, w_desc, weight_buf, layer, params); + add_projection_weights( + handle, rnn_desc, x_desc, w_desc, weight_buf, layer, params); #else - add_projection_weights(handle, rnn_desc, weight_buf, layer, params); + add_projection_weights(handle, rnn_desc, weight_buf, layer, params); #endif - layer_params_count++; - } + layer_params_count++; + } - if (layer == 0) { - global_layer_params_count = layer_params_count; - } else { - TORCH_INTERNAL_ASSERT(global_layer_params_count == layer_params_count, - "global_layer_params_count = ", global_layer_params_count, - "; layer_params_count = ", layer_params_count); - } - } // for layer - return std::make_pair(params, global_layer_params_count); - } + if (layer == 0) { + global_layer_params_count = layer_params_count; + } else { + TORCH_INTERNAL_ASSERT( + global_layer_params_count == layer_params_count, + "global_layer_params_count = ", + global_layer_params_count, + "; layer_params_count = ", + layer_params_count); + } + } // for layer + return std::make_pair(params, global_layer_params_count); +} - // This is a lightweight version of the method above used to quickly get the expected - // parameter offsets. - std::vector get_expected_data_ptrs( - const Tensor& weight_buf, cudnnHandle_t handle, const RNNDescriptorParams& rnn, - const RNNDescriptor& rnn_desc, const TensorDescriptor& x_desc, cudnnDataType_t datatype) { +// This is a lightweight version of the method above used to quickly get the +// expected parameter offsets. 
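// A small usage sketch (illustrative, not part of this diff) of the
// (params, stride0) pair returned by get_parameters above: the flat vector is
// addressed like a matrix whose rows are layers and whose row stride is the
// per-layer parameter count, much like the MatrixRef handling in
// _viewOrCopyParams further down. The helper name, "params", and "stride0" are
// assumed to come from a prior get_parameters call.
#include <ATen/ATen.h>
#include <iostream>
#include <vector>

static void print_per_layer_params(
    const std::vector<at::Tensor>& params, size_t stride0) {
  const size_t num_layers = params.size() / stride0;
  for (size_t layer = 0; layer < num_layers; ++layer) {
    for (size_t j = 0; j < stride0; ++j) {
      // For an LSTM without projections and with biases, stride0 == 4 and j
      // indexes w_ih, w_hh, b_ih, b_hh in that order.
      const at::Tensor& p = params[layer * stride0 + j];
      std::cout << "layer " << layer << " param " << j
                << " numel " << p.numel() << "\n";
    }
  }
}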
+std::vector get_expected_data_ptrs( + const Tensor& weight_buf, + cudnnHandle_t handle, + const RNNDescriptorParams& rnn, + const RNNDescriptor& rnn_desc, + const TensorDescriptor& x_desc, + cudnnDataType_t datatype) { #ifndef USE_CUDNN_RNN_V8_API - FilterDescriptor w_desc; - w_desc.set(weight_buf, 3); + FilterDescriptor w_desc; + w_desc.set(weight_buf, 3); #endif - int64_t num_linear_layers = _num_linear_layers(rnn.mode); - int64_t num_dir_layers = rnn.num_directions() * rnn.num_layers; + int64_t num_linear_layers = _num_linear_layers(rnn.mode); + int64_t num_dir_layers = rnn.num_directions() * rnn.num_layers; #ifndef USE_CUDNN_RNN_V8_API - const auto cudnn_methods = { cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams }; + const auto cudnn_methods = { + cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams}; #else - const auto cudnn_methods = { true, false }; + const auto cudnn_methods = {true, false}; #endif - std::vector data_ptrs; - if (rnn.proj_size != 0) { - data_ptrs.reserve(num_dir_layers * (2 * 2 + 1)); - } else { - data_ptrs.reserve(num_dir_layers * 2 * 2); - } - for (const auto layer : c10::irange(num_dir_layers)) { - for (auto cudnn_method : cudnn_methods) { - // This API returns a separate pointer for weight of every gate, - // but we represent them as a single tensor, so we're only interested - // in a very limited subset of possible values. - const std::array linear_offsets = { 0, num_linear_layers / 2 }; - for (int64_t linear_id : linear_offsets) { - void* matrix_pointer; + std::vector data_ptrs; + if (rnn.proj_size != 0) { + data_ptrs.reserve(num_dir_layers * (2 * 2 + 1)); + } else { + data_ptrs.reserve(num_dir_layers * 2 * 2); + } + for (const auto layer : c10::irange(num_dir_layers)) { + for (auto cudnn_method : cudnn_methods) { + // This API returns a separate pointer for weight of every gate, + // but we represent them as a single tensor, so we're only interested + // in a very limited subset of possible values. + const std::array linear_offsets = {0, num_linear_layers / 2}; + for (int64_t linear_id : linear_offsets) { + void* matrix_pointer; #ifndef USE_CUDNN_RNN_V8_API - FilterDescriptor lin_layer_mat_desc; - AT_CUDNN_CHECK(cudnn_method( - handle, - rnn_desc.desc(), - layer, - x_desc.desc(), - w_desc.desc(), - weight_buf.data_ptr(), - linear_id, - lin_layer_mat_desc.mut_desc(), - &matrix_pointer - )); + FilterDescriptor lin_layer_mat_desc; + AT_CUDNN_CHECK(cudnn_method( + handle, + rnn_desc.desc(), + layer, + x_desc.desc(), + w_desc.desc(), + weight_buf.data_ptr(), + linear_id, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer)); #else - void *unused_pointer = nullptr; + void* unused_pointer = nullptr; TensorDescriptor unused_desc; TensorDescriptor lin_layer_mat_desc; - if (cudnn_method) { // matrix - AT_CUDNN_CHECK(cudnnGetRNNWeightParams( - handle, - rnn_desc.desc(), - layer, - weight_buf.numel() * weight_buf.element_size(), - weight_buf.data_ptr(), - linear_id, - lin_layer_mat_desc.mut_desc(), - &matrix_pointer, - unused_desc.mut_desc(), - &unused_pointer - )); - } else { // bias - AT_CUDNN_CHECK(cudnnGetRNNWeightParams( - handle, - rnn_desc.desc(), - layer, - weight_buf.numel() * weight_buf.element_size(), - weight_buf.data_ptr(), - linear_id, - unused_desc.mut_desc(), - &unused_pointer, - lin_layer_mat_desc.mut_desc(), - &matrix_pointer - )); - } -#endif - data_ptrs.push_back(matrix_pointer); - } - } - if (rnn.proj_size != 0) { - // assuming it's LSTM which has 8 "linear layers" (i.e. 
4 weights and 4 biases) - int64_t linear_id = 8; - void* matrix_pointer; -#ifndef USE_CUDNN_RNN_V8_API - FilterDescriptor lin_layer_mat_desc; - AT_CUDNN_CHECK(cudnnGetRNNLinLayerMatrixParams( + if (cudnn_method) { // matrix + AT_CUDNN_CHECK(cudnnGetRNNWeightParams( handle, rnn_desc.desc(), layer, - x_desc.desc(), - w_desc.desc(), + weight_buf.numel() * weight_buf.element_size(), weight_buf.data_ptr(), linear_id, lin_layer_mat_desc.mut_desc(), - &matrix_pointer - )); -#else - void *unused_pointer; - TensorDescriptor unused_desc; - TensorDescriptor lin_layer_mat_desc; - - AT_CUDNN_CHECK(cudnnGetRNNWeightParams( + &matrix_pointer, + unused_desc.mut_desc(), + &unused_pointer)); + } else { // bias + AT_CUDNN_CHECK(cudnnGetRNNWeightParams( handle, rnn_desc.desc(), layer, weight_buf.numel() * weight_buf.element_size(), weight_buf.data_ptr(), linear_id, + unused_desc.mut_desc(), + &unused_pointer, lin_layer_mat_desc.mut_desc(), - &matrix_pointer, - unused_desc.mut_desc(), &unused_pointer)); + &matrix_pointer)); + } #endif data_ptrs.push_back(matrix_pointer); } } - return data_ptrs; - } + if (rnn.proj_size != 0) { + // assuming it's LSTM which has 8 "linear layers" (i.e. 4 weights and 4 + // biases) + int64_t linear_id = 8; + void* matrix_pointer; +#ifndef USE_CUDNN_RNN_V8_API + FilterDescriptor lin_layer_mat_desc; + AT_CUDNN_CHECK(cudnnGetRNNLinLayerMatrixParams( + handle, + rnn_desc.desc(), + layer, + x_desc.desc(), + w_desc.desc(), + weight_buf.data_ptr(), + linear_id, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer)); +#else + void* unused_pointer; + TensorDescriptor unused_desc; + TensorDescriptor lin_layer_mat_desc; - void _viewOrCopyOneParam(const Tensor& param_from, const Tensor& param_to, - bool copy, bool allow_type_change=false) { - // if copying, allow_type_change may be true or false. - // if viewing, allow_type_change must be false. - TORCH_INTERNAL_ASSERT(copy || !allow_type_change, - "if viewing, type change is not allowed."); - TORCH_INTERNAL_ASSERT(allow_type_change || (param_from.scalar_type() == param_to.scalar_type()), - "parameter types mismatch"); - if (copy) { - param_to.copy_(param_from.view_as(param_to)); - } else { - param_from.resize_as_(param_to); + AT_CUDNN_CHECK(cudnnGetRNNWeightParams( + handle, + rnn_desc.desc(), + layer, + weight_buf.numel() * weight_buf.element_size(), + weight_buf.data_ptr(), + linear_id, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer, + unused_desc.mut_desc(), + &unused_pointer)); +#endif + data_ptrs.push_back(matrix_pointer); } } + return data_ptrs; +} - void _viewOrCopyParams(MatrixRef params_from, MatrixRef params_to, - bool copy, bool allow_type_change=false) { - TORCH_INTERNAL_ASSERT(params_from.size(0) == params_to.size(0), "number of layers mismatch"); - for (const auto i : c10::irange(params_from.size(0))) { - auto layer_params_from = params_from[i]; - auto layer_params_to = params_to[i]; - // NOTE: these lists have all weights before all biases, so if the layer - // doesn't use biases, iteration will terminate once layer_params_from ends - // and ignore them. - - // NOTE: there is an exception from the above statement. If LSTMs with projections - // are used, weights layout will be w_ih, w_hh, b_ih, b_hh, w_hr. So need to handle no-bias - // case specially, because will need to copy 0->0, 1->1, 2->4. This case can be uniquely - // identified by checking if number of defined parameters for each layer is 3. 
- if (layer_params_from.size() == 3 && layer_params_to.size() != 3) { - _viewOrCopyOneParam(layer_params_from[0], layer_params_to[0], copy, allow_type_change); - _viewOrCopyOneParam(layer_params_from[1], layer_params_to[1], copy, allow_type_change); - _viewOrCopyOneParam(layer_params_from[2], layer_params_to[4], copy, allow_type_change); - continue; - } - if (layer_params_to.size() == 3 && layer_params_from.size() != 3) { - _viewOrCopyOneParam(layer_params_from[0], layer_params_to[0], copy, allow_type_change); - _viewOrCopyOneParam(layer_params_from[1], layer_params_to[1], copy, allow_type_change); - _viewOrCopyOneParam(layer_params_from[4], layer_params_to[2], copy, allow_type_change); - continue; - } - for (auto a = layer_params_from.begin(), b = layer_params_to.begin(); - a != layer_params_from.end() && b != layer_params_to.end(); - ++a, ++b) { - _viewOrCopyOneParam(*a, *b, copy, allow_type_change); - } - } +void _viewOrCopyOneParam( + const Tensor& param_from, + const Tensor& param_to, + bool copy, + bool allow_type_change = false) { + // if copying, allow_type_change may be true or false. + // if viewing, allow_type_change must be false. + TORCH_INTERNAL_ASSERT( + copy || !allow_type_change, "if viewing, type change is not allowed."); + TORCH_INTERNAL_ASSERT( + allow_type_change || (param_from.scalar_type() == param_to.scalar_type()), + "parameter types mismatch"); + if (copy) { + param_to.copy_(param_from.view_as(param_to)); + } else { + param_from.resize_as_(param_to); } +} - void _copyParams(MatrixRef params_from, MatrixRef params_to) { - _viewOrCopyParams(params_from, params_to, true); +void _viewOrCopyParams( + MatrixRef params_from, + MatrixRef params_to, + bool copy, + bool allow_type_change = false) { + TORCH_INTERNAL_ASSERT( + params_from.size(0) == params_to.size(0), "number of layers mismatch"); + for (const auto i : c10::irange(params_from.size(0))) { + auto layer_params_from = params_from[i]; + auto layer_params_to = params_to[i]; + // NOTE: these lists have all weights before all biases, so if the layer + // doesn't use biases, iteration will terminate once layer_params_from ends + // and ignore them. + + // NOTE: there is an exception from the above statement. If LSTMs with + // projections are used, weights layout will be w_ih, w_hh, b_ih, b_hh, + // w_hr. So need to handle no-bias case specially, because will need to copy + // 0->0, 1->1, 2->4. This case can be uniquely identified by checking if + // number of defined parameters for each layer is 3. 
+ if (layer_params_from.size() == 3 && layer_params_to.size() != 3) { + _viewOrCopyOneParam( + layer_params_from[0], layer_params_to[0], copy, allow_type_change); + _viewOrCopyOneParam( + layer_params_from[1], layer_params_to[1], copy, allow_type_change); + _viewOrCopyOneParam( + layer_params_from[2], layer_params_to[4], copy, allow_type_change); + continue; + } + if (layer_params_to.size() == 3 && layer_params_from.size() != 3) { + _viewOrCopyOneParam( + layer_params_from[0], layer_params_to[0], copy, allow_type_change); + _viewOrCopyOneParam( + layer_params_from[1], layer_params_to[1], copy, allow_type_change); + _viewOrCopyOneParam( + layer_params_from[4], layer_params_to[2], copy, allow_type_change); + continue; + } + for (auto a = layer_params_from.begin(), b = layer_params_to.begin(); + a != layer_params_from.end() && b != layer_params_to.end(); + ++a, ++b) { + _viewOrCopyOneParam(*a, *b, copy, allow_type_change); + } } +} - void _viewParams(MatrixRef params_from, MatrixRef params_to) { - _viewOrCopyParams(params_from, params_to, false); - } +void _copyParams(MatrixRef params_from, MatrixRef params_to) { + _viewOrCopyParams(params_from, params_to, true); +} +void _viewParams(MatrixRef params_from, MatrixRef params_to) { + _viewOrCopyParams(params_from, params_to, false); +} - std::vector _input_size(const TensorDescriptorListParams& tensors) { - if (tensors.is_input_packed()) { - return {tensors.batch_sizes_sum, tensors.input_size}; - } else { - return {tensors.seq_length, tensors.mini_batch, tensors.input_size}; - } +std::vector _input_size(const TensorDescriptorListParams& tensors) { + if (tensors.is_input_packed()) { + return {tensors.batch_sizes_sum, tensors.input_size}; + } else { + return {tensors.seq_length, tensors.mini_batch, tensors.input_size}; } +} - std::vector _hidden_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) { - if (rnn.proj_size != 0) { - return {rnn.num_layers * rnn.num_directions(), tensors.mini_batch, rnn.proj_size}; - } else { - return {rnn.num_layers * rnn.num_directions(), tensors.mini_batch, rnn.hidden_size}; - } +std::vector _hidden_size( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + if (rnn.proj_size != 0) { + return { + rnn.num_layers * rnn.num_directions(), + tensors.mini_batch, + rnn.proj_size}; + } else { + return { + rnn.num_layers * rnn.num_directions(), + tensors.mini_batch, + rnn.hidden_size}; } +} - std::vector _cell_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) { - return {rnn.num_layers * rnn.num_directions(), tensors.mini_batch, rnn.hidden_size}; - } +std::vector _cell_size( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + return { + rnn.num_layers * rnn.num_directions(), + tensors.mini_batch, + rnn.hidden_size}; +} - std::vector _output_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) { - auto out_size = rnn.hidden_size; - if (rnn.proj_size != 0) { - out_size = rnn.proj_size; - } - if (tensors.is_input_packed()) { - return {tensors.batch_sizes_sum, out_size * rnn.num_directions()}; - } else { - return {tensors.seq_length, tensors.mini_batch, out_size * rnn.num_directions()}; - } +std::vector _output_size( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + auto out_size = rnn.hidden_size; + if (rnn.proj_size != 0) { + out_size = rnn.proj_size; } - - inline bool use_persist_common_heuristics(const RNNDescriptorParams& rnn, - const 
TensorDescriptorListParams& tensors) { - return rnn.num_layers == 1 && - rnn.hidden_size <= 1024 && - rnn.num_directions() == 1 && - rnn.hidden_size % 128 == 0 && - tensors.input_size % 128 == 0; + if (tensors.is_input_packed()) { + return {tensors.batch_sizes_sum, out_size * rnn.num_directions()}; + } else { + return { + tensors.seq_length, + tensors.mini_batch, + out_size * rnn.num_directions()}; } +} - inline bool use_persist_device_heuristics(const RNNDescriptorParams& rnn, - const TensorDescriptorListParams& tensors) { - auto bsize = tensors.mini_batch; - cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); - if (prop->major == 7) { - if (prop->minor == 5) { - // Excludes Turing from using persistent rnn. - return false; - } else { - // technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, - // weed them out - return ((bsize % 16 == 0 && bsize != 80 && bsize !=112) || bsize == 8) && - ((tensors.seq_length >=40 && bsize <=128) || - (tensors.seq_length >=20 && bsize <=96) || - (tensors.seq_length >=10 && bsize <=32)); - } - } else if (prop->major >= 8 && prop->multiProcessorCount >= 98) { - // SM count check excludes A30 (similar issue to A40) - if (prop->minor == 6) { - // Excludes sm_86 GPU devices from using persistent rnn. - // This is because there are some edge cases that will throw exceptions with cudnn 8.0.5 on Nvidia A40 GPU. - return false; - } - // Based on tests by Vasily Volkov and xwang233. Vasily only tried bsize <= 128, - // so conservatively enable persistence for bsize <= 128 only. - // TODO: Run more tests for bsize > 128. - if (rnn.mode == CUDNN_GRU) { - // Persistent GRU performance is flakier than other RNN types. Exclude them for now. - // TODO: Write a more refined GRU heuristic. - return false; - } else if (rnn.mode == CUDNN_LSTM) { - // Persistent LSTMs are comparable to or better than non-persistent for bsize <= 128. - return (bsize % 8 == 0) && (bsize <= 128); - } else { - // Persistent RNN_RELU and TANH show poor performance when bsize >= 96 AND hidden size >= 896. - return (bsize % 8 == 0) && (bsize <= 128) && (bsize < 96 || rnn.hidden_size < 896); - } +inline bool use_persist_common_heuristics( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + return rnn.num_layers == 1 && rnn.hidden_size <= 1024 && + rnn.num_directions() == 1 && rnn.hidden_size % 128 == 0 && + tensors.input_size % 128 == 0; +} + +inline bool use_persist_device_heuristics( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + auto bsize = tensors.mini_batch; + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + if (prop->major == 7) { + if (prop->minor == 5) { + // Excludes Turing from using persistent rnn. + return false; } else { + // technically, batch size should be multiple of 8, but there are quite a + // few multiple-of-8 batchsizes that give bad perf, weed them out + return ((bsize % 16 == 0 && bsize != 80 && bsize != 112) || bsize == 8) && + ((tensors.seq_length >= 40 && bsize <= 128) || + (tensors.seq_length >= 20 && bsize <= 96) || + (tensors.seq_length >= 10 && bsize <= 32)); + } + } else if (prop->major >= 8 && prop->multiProcessorCount >= 98) { + // SM count check excludes A30 (similar issue to A40) + if (prop->minor == 6) { + // Excludes sm_86 GPU devices from using persistent rnn. + // This is because there are some edge cases that will throw exceptions + // with cudnn 8.0.5 on Nvidia A40 GPU. 
return false; } + // Based on tests by Vasily Volkov and xwang233. Vasily only tried bsize <= + // 128, so conservatively enable persistence for bsize <= 128 only. + // TODO: Run more tests for bsize > 128. + if (rnn.mode == CUDNN_GRU) { + // Persistent GRU performance is flakier than other RNN types. Exclude + // them for now. + // TODO: Write a more refined GRU heuristic. + return false; + } else if (rnn.mode == CUDNN_LSTM) { + // Persistent LSTMs are comparable to or better than non-persistent for + // bsize <= 128. + return (bsize % 8 == 0) && (bsize <= 128); + } else { + // Persistent RNN_RELU and TANH show poor performance when bsize >= 96 AND + // hidden size >= 896. + return (bsize % 8 == 0) && (bsize <= 128) && + (bsize < 96 || rnn.hidden_size < 896); + } + } else { + return false; } +} - inline bool use_rnn_persist_small_h(const RNNDescriptorParams& rnn, - const TensorDescriptorListParams& tensors, - bool forward) { +inline bool use_rnn_persist_small_h( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors, + bool forward) { #if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8201 // 8.2.1 - cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); - if (prop->major < 6) return false; + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + if (prop->major < 6) + return false; - if (forward) { - if (rnn.mode == CUDNN_RNN_RELU || rnn.mode == CUDNN_RNN_TANH) { - return rnn.hidden_size <= 384; - } - if (rnn.mode == CUDNN_LSTM || rnn.mode == CUDNN_GRU) { - return rnn.hidden_size <= 192; - } - } else /* backward */ { - if (rnn.mode == CUDNN_RNN_RELU || rnn.mode == CUDNN_RNN_TANH) { - return rnn.hidden_size <= 256; - } - if (rnn.mode == CUDNN_LSTM || rnn.mode == CUDNN_GRU) { - return rnn.hidden_size <= 128; - } + if (forward) { + if (rnn.mode == CUDNN_RNN_RELU || rnn.mode == CUDNN_RNN_TANH) { + return rnn.hidden_size <= 384; + } + if (rnn.mode == CUDNN_LSTM || rnn.mode == CUDNN_GRU) { + return rnn.hidden_size <= 192; } + } else /* backward */ { + if (rnn.mode == CUDNN_RNN_RELU || rnn.mode == CUDNN_RNN_TANH) { + return rnn.hidden_size <= 256; + } + if (rnn.mode == CUDNN_LSTM || rnn.mode == CUDNN_GRU) { + return rnn.hidden_size <= 128; + } + } - return false; + return false; #else - return false; + return false; #endif - } +} - cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input, bool forward) { - // LSTM with projections only works with standard algorithm - if (rnn.proj_size != 0) { - return CUDNN_RNN_ALGO_STANDARD; - } +cudnnRNNAlgo_t get_algo( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors, + const Tensor input, + bool forward) { + // LSTM with projections only works with standard algorithm + if (rnn.proj_size != 0) { + return CUDNN_RNN_ALGO_STANDARD; + } - // Persistent algos typically don't work for packed inputs with sequence lengths that vary - // across batch elements, and will return CUDNN_STATUS_NOT_SUPPORTED if attempted. See - // https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#features-of-rnn-functions - if (!tensors.is_input_packed()) { - auto cudnnDataType = getCudnnDataType(input); + // Persistent algos typically don't work for packed inputs with sequence + // lengths that vary across batch elements, and will return + // CUDNN_STATUS_NOT_SUPPORTED if attempted. 
See + // https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#features-of-rnn-functions + if (!tensors.is_input_packed()) { + auto cudnnDataType = getCudnnDataType(input); #if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8201 // 8.2.1 - if (cudnnDataType != CUDNN_DATA_DOUBLE) { - if (use_rnn_persist_small_h(rnn, tensors, forward)) { - return CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H; - } + if (cudnnDataType != CUDNN_DATA_DOUBLE) { + if (use_rnn_persist_small_h(rnn, tensors, forward)) { + return CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H; } + } #endif - if (cudnnDataType == CUDNN_DATA_HALF) { - if (use_persist_common_heuristics(rnn, tensors) && - use_persist_device_heuristics(rnn, tensors)) { - return CUDNN_RNN_ALGO_PERSIST_STATIC; - } + if (cudnnDataType == CUDNN_DATA_HALF) { + if (use_persist_common_heuristics(rnn, tensors) && + use_persist_device_heuristics(rnn, tensors)) { + return CUDNN_RNN_ALGO_PERSIST_STATIC; } } - - return CUDNN_RNN_ALGO_STANDARD; } - cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) { - if (dtype == CUDNN_DATA_HALF) { - return CUDNN_DATA_FLOAT; - } - return dtype; + return CUDNN_RNN_ALGO_STANDARD; +} + +cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) { + if (dtype == CUDNN_DATA_HALF) { + return CUDNN_DATA_FLOAT; } + return dtype; +} -} // anonymous namespace +} // namespace native // Utilities exposed in RNNUtils.h namespace cudnn_rnn { @@ -1097,7 +1281,8 @@ copy_weights_to_flat_buf_views( #ifdef USE_CUDNN_RNN_V8_API input_size, false, // eqy: bogus as we do not know if the input is packed here - // but it should not affect the weights (what are are interested in) + // but it should not affect the weights (what are are interested + // in) #endif hidden_size, proj_size, @@ -1130,9 +1315,7 @@ copy_weights_to_flat_buf_views( #endif // Slice off views into weight_buf - std::vector params_arr; - size_t params_stride0; - std::tie(params_arr, params_stride0) = get_parameters( + auto [params_arr, params_stride0] = get_parameters( #ifndef USE_CUDNN_RNN_V8_API handle, rnn, rnn_desc, x_desc, w_desc, weight_buf, include_bias); #else @@ -1177,12 +1360,15 @@ using namespace cudnn_rnn; // functions, only one of which does an inplace update, but we leave this // for future work Tensor _cudnn_rnn_flatten_weight( - TensorList weight_arr, int64_t weight_stride0, + TensorList weight_arr, + int64_t weight_stride0, int64_t input_size, - int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, - int64_t fn_num_layers, bool batch_first, - bool fn_bidirectional - ) { + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_proj_size, + int64_t fn_num_layers, + bool batch_first, + bool fn_bidirectional) { // returns flat weight_buf return std::get<0>(copy_weights_to_flat_buf_views( weight_arr, @@ -1199,24 +1385,37 @@ Tensor _cudnn_rnn_flatten_weight( /*set_orig_weights_to_flat_buf=*/true)); } -const char * WEIGHT_FORMAT_WARN = "RNN module weights are not part of single contiguous " - "chunk of memory. This means they need to be compacted " - "at every call, possibly greatly increasing memory usage. " - "To compact weights again call flatten_parameters()."; +const char* WEIGHT_FORMAT_WARN = + "RNN module weights are not part of single contiguous " + "chunk of memory. This means they need to be compacted " + "at every call, possibly greatly increasing memory usage. 
" + "To compact weights again call flatten_parameters()."; // NB: when fn_batch_sizes is empty, that means no batch sizes was specified std::tuple _cudnn_rnn( const Tensor& input_r, - TensorList weight, int64_t weight_stride0, const c10::optional& weight_buf_r_opt, const Tensor& hx, const c10::optional& cx_opt, - int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, - int64_t fn_num_layers, bool batch_first, double fn_dropout, - bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, const c10::optional& fn_dropout_state_opt - ) { + TensorList weight, + int64_t weight_stride0, + const c10::optional& weight_buf_r_opt, + const Tensor& hx, + const c10::optional& cx_opt, + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_proj_size, + int64_t fn_num_layers, + bool batch_first, + double fn_dropout, + bool fn_train, + bool fn_bidirectional, + IntArrayRef fn_batch_sizes, + const c10::optional& fn_dropout_state_opt) { // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned weight_buf_r_maybe_owned = at::borrow_from_optional_tensor(weight_buf_r_opt); + c10::MaybeOwned weight_buf_r_maybe_owned = + at::borrow_from_optional_tensor(weight_buf_r_opt); const Tensor& weight_buf_r = *weight_buf_r_maybe_owned; - const Tensor& cx = c10::value_or_else(cx_opt, [] {return Tensor();}); - const Tensor& fn_dropout_state = c10::value_or_else(fn_dropout_state_opt, [] {return Tensor();}); + const Tensor& cx = c10::value_or_else(cx_opt, [] { return Tensor(); }); + const Tensor& fn_dropout_state = + c10::value_or_else(fn_dropout_state_opt, [] { return Tensor(); }); check_attributes(input_r, weight, {hx, cx}, /*check_dtype=*/true); auto input = input_r; @@ -1225,18 +1424,34 @@ std::tuple _cudnn_rnn( TORCH_WARN(WEIGHT_FORMAT_WARN); } if (fn_dropout_state.defined()) { - auto input_arg = TensorArg(input, "input", 1); - auto dropout_state_arg = TensorArg(fn_dropout_state, "dropout_states", 15); - checkSameGPU("cudnn_rnn", input_arg, dropout_state_arg); + auto input_arg = TensorArg(input, "input", 1); + auto dropout_state_arg = TensorArg(fn_dropout_state, "dropout_states", 15); + checkSameGPU("cudnn_rnn", input_arg, dropout_state_arg); } RNNParams fn; auto datatype = getCudnnDataType(input); #ifndef USE_CUDNN_RNN_V8_API - fn.rnn.set(fn_mode, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set( + fn_mode, + fn_hidden_size, + fn_proj_size, + fn_num_layers, + fn_bidirectional, + promote_rnn_math_type(datatype), + datatype); #else auto input_size = input_r.size(-1); auto packed = fn_batch_sizes.size() != 0; - fn.rnn.set(fn_mode, input_size, packed, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set( + fn_mode, + input_size, + packed, + fn_hidden_size, + fn_proj_size, + fn_num_layers, + fn_bidirectional, + promote_rnn_math_type(datatype), + datatype); #endif fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); @@ -1244,8 +1459,7 @@ std::tuple _cudnn_rnn( // TODO: Set device to input if (fn.rnn.mode != CUDNN_LSTM) { - TORCH_CHECK(!cx.defined(), - "rnn: illegal defined cx for non-LSTM RNN"); + TORCH_CHECK(!cx.defined(), "rnn: illegal defined cx for non-LSTM RNN"); } // TODO: can batch_first be a wrapper around this function? 
@@ -1258,10 +1472,8 @@ std::tuple _cudnn_rnn( auto cell_size = _cell_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - TORCH_CHECK(hx.is_contiguous(), - "rnn: hx is not contiguous"); - TORCH_CHECK(!cx.defined() || cx.is_contiguous(), - "rnn: cx is not contiguous"); + TORCH_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous"); + TORCH_CHECK(!cx.defined() || cx.is_contiguous(), "rnn: cx is not contiguous"); auto x = input.contiguous(); auto output = at::empty(output_size, input.options()); @@ -1270,7 +1482,8 @@ std::tuple _cudnn_rnn( if (cx.defined()) { cy = at::empty(cell_size, cx.options()); } else { - cy = at::empty({0}, hx.options()); // NB: Not allowed to return undefined tensors + cy = at::empty( + {0}, hx.options()); // NB: Not allowed to return undefined tensors } auto y = output; @@ -1284,7 +1497,8 @@ std::tuple _cudnn_rnn( #endif if (!weight_buf.defined()) { #ifndef USE_CUDNN_RNN_V8_API - auto num_weights = get_num_weights(handle, descs.rnn_desc, descs.x_descs[0], datatype); + auto num_weights = + get_num_weights(handle, descs.rnn_desc, descs.x_descs[0], datatype); #else auto num_weights = get_num_weights(handle, descs.rnn_desc, datatype); #endif @@ -1293,23 +1507,28 @@ std::tuple _cudnn_rnn( w_desc.set(weight_buf, 3); #endif weight_buf.zero_(); - std::vector params; - size_t params_stride0; #ifndef USE_CUDNN_RNN_V8_API - std::tie(params, params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, weight_buf); + auto [params, params_stride0] = get_parameters( + handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, weight_buf); #else - std::tie(params, params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, weight_buf); + auto [params, params_stride0] = + get_parameters(handle, fn.rnn, descs.rnn_desc, weight_buf); #endif - _copyParams(MatrixRef{weight, static_cast(weight_stride0)}, - MatrixRef{params, params_stride0}); + _copyParams( + MatrixRef{weight, static_cast(weight_stride0)}, + MatrixRef{params, params_stride0}); } else { #ifndef USE_CUDNN_RNN_V8_API w_desc.set(weight_buf, 3); #endif } - TORCH_CHECK(!cx.defined() || cx.sizes().equals(cell_size), - "Expected cell size ", IntArrayRef{cell_size}, ", got ", cx.sizes()); + TORCH_CHECK( + !cx.defined() || cx.sizes().equals(cell_size), + "Expected cell size ", + IntArrayRef{cell_size}, + ", got ", + cx.sizes()); size_t workspace_size; #ifndef USE_CUDNN_RNN_V8_API auto x_descs_arr = descs.get_x_descs(); @@ -1320,12 +1539,11 @@ std::tuple _cudnn_rnn( #endif #ifndef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), - &workspace_size - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &workspace_size)); #endif Tensor workspace; Tensor reserve; @@ -1335,93 +1553,123 @@ std::tuple _cudnn_rnn( size_t reserve_size; #ifndef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnGetRNNTrainingReserveSize( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), - &reserve_size - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &reserve_size)); #else AT_CUDNN_CHECK(cudnnGetRNNTempSpaceSizes( - handle, - descs.rnn_desc.desc(), - CUDNN_FWD_MODE_TRAINING, - x_descs_arr.desc(), - &workspace_size, - &reserve_size - )); + handle, + descs.rnn_desc.desc(), + CUDNN_FWD_MODE_TRAINING, + x_descs_arr.desc(), + &workspace_size, + &reserve_size)); #endif workspace = at::empty(workspace_size, 
input.options().dtype(kByte)); reserve = at::empty(reserve_size, input.options().dtype(kByte)); #ifndef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnRNNForwardTraining( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), x.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, - w_desc.desc(), weight_buf.data_ptr(), - y_descs_arr.data(), y.data_ptr(), - descs.hy_desc.desc(), hy.data_ptr(), - descs.cy_desc.desc(), cy.defined() ? cy.data_ptr() : nullptr, - workspace.data_ptr(), workspace.size(0), - reserve.mutable_data_ptr(), reserve.size(0) - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + x.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? cx.data_ptr() : nullptr, + w_desc.desc(), + weight_buf.data_ptr(), + y_descs_arr.data(), + y.data_ptr(), + descs.hy_desc.desc(), + hy.data_ptr(), + descs.cy_desc.desc(), + cy.defined() ? cy.data_ptr() : nullptr, + workspace.data_ptr(), + workspace.size(0), + reserve.mutable_data_ptr(), + reserve.size(0))); #else AT_CUDNN_CHECK(cudnnRNNForward( - handle, - descs.rnn_desc.desc(), - CUDNN_FWD_MODE_TRAINING, - nullptr, - x_descs_arr.desc(), x.data_ptr(), - y_descs_arr.desc(), y.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), hy.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, cy.defined() ? cy.data_ptr() : nullptr, - weight_buf.numel() * weight_buf.element_size(), weight_buf.data_ptr(), - workspace.size(0), workspace.data_ptr(), - reserve.size(0), reserve.mutable_data_ptr())); + handle, + descs.rnn_desc.desc(), + CUDNN_FWD_MODE_TRAINING, + nullptr, + x_descs_arr.desc(), + x.data_ptr(), + y_descs_arr.desc(), + y.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + hy.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? cx.data_ptr() : nullptr, + cy.defined() ? cy.data_ptr() : nullptr, + weight_buf.numel() * weight_buf.element_size(), + weight_buf.data_ptr(), + workspace.size(0), + workspace.data_ptr(), + reserve.size(0), + reserve.mutable_data_ptr())); #endif } else { // inference #ifdef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnGetRNNTempSpaceSizes( - handle, - descs.rnn_desc.desc(), - CUDNN_FWD_MODE_INFERENCE, - x_descs_arr.desc(), - &workspace_size, - NULL - )); + handle, + descs.rnn_desc.desc(), + CUDNN_FWD_MODE_INFERENCE, + x_descs_arr.desc(), + &workspace_size, + NULL)); #endif workspace = at::empty(workspace_size, input.options().dtype(kByte)); reserve = at::empty({0}, input.options().dtype(kByte)); #ifndef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnRNNForwardInference( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), x.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, - w_desc.desc(), weight_buf.data_ptr(), - y_descs_arr.data(), y.data_ptr(), - descs.hy_desc.desc(), hy.data_ptr(), - descs.cy_desc.desc(), cy.defined() ? cy.data_ptr() : nullptr, - workspace.data_ptr(), workspace.size(0) - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + x.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? cx.data_ptr() : nullptr, + w_desc.desc(), + weight_buf.data_ptr(), + y_descs_arr.data(), + y.data_ptr(), + descs.hy_desc.desc(), + hy.data_ptr(), + descs.cy_desc.desc(), + cy.defined() ? 
cy.data_ptr() : nullptr, + workspace.data_ptr(), + workspace.size(0))); #else AT_CUDNN_CHECK(cudnnRNNForward( - handle, - descs.rnn_desc.desc(), - CUDNN_FWD_MODE_INFERENCE, - nullptr, - x_descs_arr.desc(), x.data_ptr(), - y_descs_arr.desc(), y.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), hy.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, cy.defined() ? cy.data_ptr() : nullptr, - weight_buf.numel() * weight_buf.element_size(), weight_buf.data_ptr(), - workspace.size(0), workspace.data_ptr(), - reserve.size(0), reserve.mutable_data_ptr())); + handle, + descs.rnn_desc.desc(), + CUDNN_FWD_MODE_INFERENCE, + nullptr, + x_descs_arr.desc(), + x.data_ptr(), + y_descs_arr.desc(), + y.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + hy.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? cx.data_ptr() : nullptr, + cy.defined() ? cy.data_ptr() : nullptr, + weight_buf.numel() * weight_buf.element_size(), + weight_buf.data_ptr(), + workspace.size(0), + workspace.data_ptr(), + reserve.size(0), + reserve.mutable_data_ptr())); #endif } @@ -1433,16 +1681,26 @@ std::tuple _cudnn_rnn( } std::tuple _cudnn_rnn_backward_input( - const Tensor& input_r, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, - const Tensor& output_r, const Tensor& grad_output_r, const Tensor& grad_hy, + const Tensor& input_r, + const Tensor& weight_buf, + const Tensor& hx, + const Tensor& cx, + const Tensor& output_r, + const Tensor& grad_output_r, + const Tensor& grad_hy, const Tensor& grad_cy, - int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, - int64_t fn_num_layers, bool batch_first, double fn_dropout, - bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, - const Tensor& fn_dropout_state, const Tensor& fn_reserve, - std::array output_mask - ) { - + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_proj_size, + int64_t fn_num_layers, + bool batch_first, + double fn_dropout, + bool fn_train, + bool fn_bidirectional, + IntArrayRef fn_batch_sizes, + const Tensor& fn_dropout_state, + const Tensor& fn_reserve, + std::array output_mask) { auto input = input_r; auto grad_output = grad_output_r; auto output = output_r; @@ -1450,11 +1708,27 @@ std::tuple _cudnn_rnn_backward_input( RNNParams fn; auto datatype = getCudnnDataType(input); #ifndef USE_CUDNN_RNN_V8_API - fn.rnn.set(fn_mode, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set( + fn_mode, + fn_hidden_size, + fn_proj_size, + fn_num_layers, + fn_bidirectional, + promote_rnn_math_type(datatype), + datatype); #else auto cudnn_input_size = input_r.size(-1); auto packed = fn_batch_sizes.size() != 0; - fn.rnn.set(fn_mode, cudnn_input_size, packed, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set( + fn_mode, + cudnn_input_size, + packed, + fn_hidden_size, + fn_proj_size, + fn_num_layers, + fn_bidirectional, + promote_rnn_math_type(datatype), + datatype); #endif fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); @@ -1463,8 +1737,7 @@ std::tuple _cudnn_rnn_backward_input( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - TORCH_CHECK(!cx.defined(), - "rnn: illegal defined cx for non-LSTM RNN"); + TORCH_CHECK(!cx.defined(), "rnn: illegal defined cx for non-LSTM RNN"); } auto is_input_packed = fn_batch_sizes.size() != 0; @@ -1479,41 +1752,68 @@ std::tuple _cudnn_rnn_backward_input( auto 
cell_size = _cell_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - TORCH_CHECK(hx.is_contiguous(), - "rnn: hx is not contiguous"); - TORCH_CHECK(!cx.defined() || cx.is_contiguous(), - "rnn: cx is not contiguous"); + TORCH_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous"); + TORCH_CHECK(!cx.defined() || cx.is_contiguous(), "rnn: cx is not contiguous"); auto x = input.contiguous(); auto dy = grad_output.contiguous(); auto y = output; auto w = weight_buf; - auto dx = at::empty(input.sizes(), input.options()); // TODO: more compact way of saying this + auto dx = at::empty( + input.sizes(), input.options()); // TODO: more compact way of saying this auto dhy = grad_hy.contiguous().view(hidden_size); - auto dcy = grad_cy.defined() ? grad_cy.contiguous().view(cell_size) : Tensor(); + auto dcy = + grad_cy.defined() ? grad_cy.contiguous().view(cell_size) : Tensor(); auto dhx = at::empty(hidden_size, hx.options()); - TORCH_INTERNAL_ASSERT(cx.defined() || !output_mask[2], "illegally required grad of cx for non-LSTM RNN"); + TORCH_INTERNAL_ASSERT( + cx.defined() || !output_mask[2], + "illegally required grad of cx for non-LSTM RNN"); auto dcx = cx.defined() ? at::empty(cell_size, cx.options()) : Tensor(); - TORCH_CHECK(fn_train, - "cudnn RNN backward can only be called in training mode"); + TORCH_CHECK( + fn_train, "cudnn RNN backward can only be called in training mode"); - TORCH_CHECK(input.sizes().equals(input_size), - "Expected input size ", IntArrayRef{input_size}, ", got ", input.sizes()); - TORCH_CHECK(output.sizes().equals(output_size), - "Expected output size ", IntArrayRef{output_size}, ", got ", output.sizes()); + TORCH_CHECK( + input.sizes().equals(input_size), + "Expected input size ", + IntArrayRef{input_size}, + ", got ", + input.sizes()); + TORCH_CHECK( + output.sizes().equals(output_size), + "Expected output size ", + IntArrayRef{output_size}, + ", got ", + output.sizes()); - TORCH_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), - "Expected hidden size ", IntArrayRef{hidden_size}, ", got ", hx.sizes()); - TORCH_CHECK(!cx.defined() || cx.sizes().equals(cell_size), - "Expected cell size ", IntArrayRef{cell_size}, ", got ", cx.sizes()); - TORCH_CHECK(!dhy.defined() || dhy.sizes().equals(hidden_size), - "Expected d_hidden size ", IntArrayRef{hidden_size}, ", got ", dhy.sizes()); - TORCH_CHECK(!dcy.defined() || dcy.sizes().equals(cell_size), - "Expected d_cell size ", IntArrayRef{cell_size}, ", got ", dcy.sizes()); + TORCH_CHECK( + !hx.defined() || hx.sizes().equals(hidden_size), + "Expected hidden size ", + IntArrayRef{hidden_size}, + ", got ", + hx.sizes()); + TORCH_CHECK( + !cx.defined() || cx.sizes().equals(cell_size), + "Expected cell size ", + IntArrayRef{cell_size}, + ", got ", + cx.sizes()); + TORCH_CHECK( + !dhy.defined() || dhy.sizes().equals(hidden_size), + "Expected d_hidden size ", + IntArrayRef{hidden_size}, + ", got ", + dhy.sizes()); + TORCH_CHECK( + !dcy.defined() || dcy.sizes().equals(cell_size), + "Expected d_cell size ", + IntArrayRef{cell_size}, + ", got ", + dcy.sizes()); - TORCH_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), - "Gradients aren't CUDA tensors"); + TORCH_CHECK( + dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), + "Gradients aren't CUDA tensors"); cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input, false); fn.rnn.set_algo(algo); @@ -1529,61 +1829,77 @@ std::tuple _cudnn_rnn_backward_input( auto x_descs_arr = descs.get_x_descs(); auto y_descs_arr = 
descs.get_y_descs(); AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), - &workspace_size - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &workspace_size)); #else auto& x_descs_arr = descs.x_descs; auto& y_descs_arr = descs.y_descs; AT_CUDNN_CHECK(cudnnGetRNNTempSpaceSizes( - handle, - descs.rnn_desc.desc(), - CUDNN_FWD_MODE_TRAINING, - x_descs_arr.desc(), - &workspace_size, - NULL - )); + handle, + descs.rnn_desc.desc(), + CUDNN_FWD_MODE_TRAINING, + x_descs_arr.desc(), + &workspace_size, + NULL)); #endif // TODO: put this in the correct device??? Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte)); #ifndef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnRNNBackwardData( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - y_descs_arr.data(), y.data_ptr(), - y_descs_arr.data(), dy.data_ptr(), - descs.hy_desc.desc(), dhy.data_ptr(), - descs.cy_desc.desc(), cx.defined() ? dcy.data_ptr() : nullptr, - w_desc.desc(), w.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, - x_descs_arr.data(), dx.data_ptr(), - descs.hx_desc.desc(), dhx.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? dcx.data_ptr() : nullptr, - workspace.data_ptr(), workspace.size(0), - fn_reserve.data_ptr(), fn_reserve.size(0) - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + y_descs_arr.data(), + y.data_ptr(), + y_descs_arr.data(), + dy.data_ptr(), + descs.hy_desc.desc(), + dhy.data_ptr(), + descs.cy_desc.desc(), + cx.defined() ? dcy.data_ptr() : nullptr, + w_desc.desc(), + w.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? cx.data_ptr() : nullptr, + x_descs_arr.data(), + dx.data_ptr(), + descs.hx_desc.desc(), + dhx.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? dcx.data_ptr() : nullptr, + workspace.data_ptr(), + workspace.size(0), + fn_reserve.data_ptr(), + fn_reserve.size(0))); #else AT_CUDNN_CHECK(cudnnRNNBackwardData_v8( - handle, - descs.rnn_desc.desc(), - nullptr, - y_descs_arr.desc(), y.data_ptr(), - dy.data_ptr(), - x_descs_arr.desc(), dx.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), - dhy.data_ptr(), - dhx.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, - cx.defined() ? dcy.data_ptr() : nullptr, - cx.defined() ? dcx.data_ptr() : nullptr, - weight_buf.numel() * weight_buf.element_size(), weight_buf.data_ptr(), - workspace.size(0), workspace.data_ptr(), - fn_reserve.size(0), fn_reserve.data_ptr())); + handle, + descs.rnn_desc.desc(), + nullptr, + y_descs_arr.desc(), + y.data_ptr(), + dy.data_ptr(), + x_descs_arr.desc(), + dx.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + dhy.data_ptr(), + dhx.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? cx.data_ptr() : nullptr, + cx.defined() ? dcy.data_ptr() : nullptr, + cx.defined() ? dcx.data_ptr() : nullptr, + weight_buf.numel() * weight_buf.element_size(), + weight_buf.data_ptr(), + workspace.size(0), + workspace.data_ptr(), + fn_reserve.size(0), + fn_reserve.data_ptr())); #endif if (batch_first && !is_input_packed) { dx = dx.transpose_(0, 1); @@ -1596,27 +1912,52 @@ std::tuple _cudnn_rnn_backward_input( // We'll give a user friendly combined function... 
std::vector _cudnn_rnn_backward_weight( // TODO: I think tensor geometry sufficient for weight_buf/weight - const Tensor& input_r, TensorList weight_arr, int64_t weight_stride0, - const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, + const Tensor& input_r, + TensorList weight_arr, + int64_t weight_stride0, + const Tensor& weight_buf, + const Tensor& hx, + const Tensor& cx, const Tensor& output_r, - int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, - int64_t fn_num_layers, bool batch_first, double fn_dropout, - bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, - const Tensor& fn_dropout_state, const Tensor& fn_reserve - ) { - - MatrixRef weight{ weight_arr, static_cast(weight_stride0) }; + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_proj_size, + int64_t fn_num_layers, + bool batch_first, + double fn_dropout, + bool fn_train, + bool fn_bidirectional, + IntArrayRef fn_batch_sizes, + const Tensor& fn_dropout_state, + const Tensor& fn_reserve) { + MatrixRef weight{weight_arr, static_cast(weight_stride0)}; auto input = input_r; auto output = output_r; RNNParams fn; auto datatype = getCudnnDataType(input); #ifndef USE_CUDNN_RNN_V8_API - fn.rnn.set(fn_mode, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set( + fn_mode, + fn_hidden_size, + fn_proj_size, + fn_num_layers, + fn_bidirectional, + promote_rnn_math_type(datatype), + datatype); #else auto cudnn_input_size = input_r.size(-1); auto packed = fn_batch_sizes.size() != 0; - fn.rnn.set(fn_mode, cudnn_input_size, packed, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set( + fn_mode, + cudnn_input_size, + packed, + fn_hidden_size, + fn_proj_size, + fn_num_layers, + fn_bidirectional, + promote_rnn_math_type(datatype), + datatype); #endif fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); @@ -1624,8 +1965,7 @@ std::vector _cudnn_rnn_backward_weight( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - TORCH_CHECK(!cx.defined(), - "rnn: illegal defined cx for non-LSTM RNN"); + TORCH_CHECK(!cx.defined(), "rnn: illegal defined cx for non-LSTM RNN"); } auto is_input_packed = fn_batch_sizes.size() != 0; @@ -1637,21 +1977,27 @@ std::vector _cudnn_rnn_backward_weight( auto input_size = _input_size(fn.tensors); auto hidden_size = _hidden_size(fn.rnn, fn.tensors); - TORCH_CHECK(fn_train, - "cudnn RNN backward can only be called in training mode"); + TORCH_CHECK( + fn_train, "cudnn RNN backward can only be called in training mode"); - TORCH_CHECK(input.sizes().equals(input_size), - "Expected input size ", IntArrayRef{input_size}, ", got ", input.sizes()); - TORCH_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), - "Expected hidden size ", IntArrayRef{hidden_size}, ", got ", hx.sizes()); + TORCH_CHECK( + input.sizes().equals(input_size), + "Expected input size ", + IntArrayRef{input_size}, + ", got ", + input.sizes()); + TORCH_CHECK( + !hx.defined() || hx.sizes().equals(hidden_size), + "Expected hidden size ", + IntArrayRef{hidden_size}, + ", got ", + hx.sizes()); // TODO: the above were the only checks in rnn.py, but it doesn't seem // like these checks are enough - TORCH_CHECK(hx.is_contiguous(), - "rnn: hx is not contiguous"); - TORCH_CHECK(!cx.defined() || cx.is_contiguous(), - "rnn: cx is not contiguous"); + TORCH_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous"); + 
TORCH_CHECK(!cx.defined() || cx.is_contiguous(), "rnn: cx is not contiguous"); auto x = input.contiguous(); const auto& y = output; @@ -1671,109 +2017,188 @@ std::vector _cudnn_rnn_backward_weight( auto x_descs_arr = descs.get_x_descs(); auto y_descs_arr = descs.get_y_descs(); AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), - &workspace_size - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &workspace_size)); #else auto& x_descs_arr = descs.x_descs; auto& y_descs_arr = descs.y_descs; AT_CUDNN_CHECK(cudnnGetRNNTempSpaceSizes( - handle, - descs.rnn_desc.desc(), - CUDNN_FWD_MODE_TRAINING, - x_descs_arr.desc(), - &workspace_size, - NULL - )); + handle, + descs.rnn_desc.desc(), + CUDNN_FWD_MODE_TRAINING, + x_descs_arr.desc(), + &workspace_size, + NULL)); #endif Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte)); #ifndef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnRNNBackwardWeights( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), x.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), - y_descs_arr.data(), y.data_ptr(), - workspace.data_ptr(), workspace.size(0), - w_desc.desc(), dw.data_ptr(), - fn_reserve.data_ptr(), fn_reserve.size(0) - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + x.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + y_descs_arr.data(), + y.data_ptr(), + workspace.data_ptr(), + workspace.size(0), + w_desc.desc(), + dw.data_ptr(), + fn_reserve.data_ptr(), + fn_reserve.size(0))); #else AT_CUDNN_CHECK(cudnnRNNBackwardWeights_v8( - handle, - descs.rnn_desc.desc(), - CUDNN_WGRAD_MODE_ADD, - nullptr, - x_descs_arr.desc(), x.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), - y_descs_arr.desc(), y.data_ptr(), - weight_buf.numel() * weight_buf.element_size(), dw.data_ptr(), - workspace.size(0), workspace.data_ptr(), - fn_reserve.size(0), fn_reserve.data_ptr())); + handle, + descs.rnn_desc.desc(), + CUDNN_WGRAD_MODE_ADD, + nullptr, + x_descs_arr.desc(), + x.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + y_descs_arr.desc(), + y.data_ptr(), + weight_buf.numel() * weight_buf.element_size(), + dw.data_ptr(), + workspace.size(0), + workspace.data_ptr(), + fn_reserve.size(0), + fn_reserve.data_ptr())); #endif - - std::vector grad_params_arr; - size_t grad_params_stride0; #ifndef USE_CUDNN_RNN_V8_API - std::tie(grad_params_arr, grad_params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, dw); + auto [grad_params_arr, grad_params_stride0] = get_parameters( + handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, dw); #else - std::tie(grad_params_arr, grad_params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, dw); + auto [grad_params_arr, grad_params_stride0] = + get_parameters(handle, fn.rnn, descs.rnn_desc, dw); #endif if (grad_params_stride0 == static_cast(weight_stride0)) { - _viewParams(MatrixRef{grad_params_arr, grad_params_stride0}, - MatrixRef{weight_arr, static_cast(weight_stride0)}); - return grad_params_arr; + _viewParams( + MatrixRef{grad_params_arr, grad_params_stride0}, + MatrixRef{weight_arr, static_cast(weight_stride0)}); + return grad_params_arr; } else { - std::vector grad_weight_arr; - grad_weight_arr.reserve( weight.numel() ); - for (const auto& w : weight_arr) { - grad_weight_arr.emplace_back(at::empty(w.sizes(), w.options())); - } - _copyParams(MatrixRef{grad_params_arr, grad_params_stride0}, - 
MatrixRef{grad_weight_arr, static_cast(weight_stride0)}); - return grad_weight_arr; + std::vector grad_weight_arr; + grad_weight_arr.reserve(weight.numel()); + for (const auto& w : weight_arr) { + grad_weight_arr.emplace_back(at::empty(w.sizes(), w.options())); + } + _copyParams( + MatrixRef{grad_params_arr, grad_params_stride0}, + MatrixRef{ + grad_weight_arr, static_cast(weight_stride0)}); + return grad_weight_arr; } } // We need this dispatcher because _cudnn_rnn_backward_weight has a stringent // ordering requirement with _cudnn_rnn_backward_input std::tuple> _cudnn_rnn_backward( - const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const c10::optional& cx_opt, - const Tensor& output, const c10::optional& grad_output_r_opt, const c10::optional& grad_hy_r_opt, const c10::optional& grad_cy_r_opt, - int64_t mode, int64_t hidden_size, int64_t proj_size, - int64_t num_layers, bool batch_first, double dropout, - bool train, bool bidirectional, IntArrayRef batch_sizes, const c10::optional& dropout_state_opt, const Tensor& reserve, - std::array output_mask - ) { + const Tensor& input, + TensorList weight, + int64_t weight_stride0, + const Tensor& weight_buf, + const Tensor& hx, + const c10::optional& cx_opt, + const Tensor& output, + const c10::optional& grad_output_r_opt, + const c10::optional& grad_hy_r_opt, + const c10::optional& grad_cy_r_opt, + int64_t mode, + int64_t hidden_size, + int64_t proj_size, + int64_t num_layers, + bool batch_first, + double dropout, + bool train, + bool bidirectional, + IntArrayRef batch_sizes, + const c10::optional& dropout_state_opt, + const Tensor& reserve, + std::array output_mask) { // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned cx_maybe_owned = at::borrow_from_optional_tensor(cx_opt); + c10::MaybeOwned cx_maybe_owned = + at::borrow_from_optional_tensor(cx_opt); const Tensor& cx = *cx_maybe_owned; - const Tensor& grad_output_r = c10::value_or_else(grad_output_r_opt, [] {return Tensor();}); - const Tensor& grad_hy_r = c10::value_or_else(grad_hy_r_opt, [] {return Tensor();}); - const Tensor& grad_cy_r = c10::value_or_else(grad_cy_r_opt, [] {return Tensor();}); - const Tensor& dropout_state = c10::value_or_else(dropout_state_opt, [] {return Tensor();}); - - if (!grad_output_r.defined() && !grad_hy_r.defined() && !grad_cy_r.defined()) { - return std::tuple>(Tensor(), Tensor(), Tensor(), std::vector(weight.size())); + const Tensor& grad_output_r = + c10::value_or_else(grad_output_r_opt, [] { return Tensor(); }); + const Tensor& grad_hy_r = + c10::value_or_else(grad_hy_r_opt, [] { return Tensor(); }); + const Tensor& grad_cy_r = + c10::value_or_else(grad_cy_r_opt, [] { return Tensor(); }); + const Tensor& dropout_state = + c10::value_or_else(dropout_state_opt, [] { return Tensor(); }); + + if (!grad_output_r.defined() && !grad_hy_r.defined() && + !grad_cy_r.defined()) { + return std::tuple>( + Tensor(), Tensor(), Tensor(), std::vector(weight.size())); } - auto grad_output = grad_output_r.defined() ? grad_output_r : at::zeros_like(output, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - auto grad_hy = grad_hy_r.defined() ? grad_hy_r : at::zeros_like(hx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? grad_cy_r : at::zeros_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT)) : grad_cy_r; + auto grad_output = grad_output_r.defined() + ? grad_output_r + : at::zeros_like(output, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto grad_hy = grad_hy_r.defined() + ? 
grad_hy_r + : at::zeros_like(hx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto grad_cy = cx.defined() + ? (grad_cy_r.defined() + ? grad_cy_r + : at::zeros_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT)) + : grad_cy_r; - Tensor dx, dhx, dcx; // NB: unconditionally compute this gradient, because it mutates reserve - std::tie(dx, dhx, dcx) = at::native::_cudnn_rnn_backward_input(input, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, {output_mask[0], output_mask[1], output_mask[2]}); + auto [dx, dhx, dcx] = at::native::_cudnn_rnn_backward_input( + input, + weight_buf, + hx, + cx, + output, + grad_output, + grad_hy, + grad_cy, + mode, + hidden_size, + proj_size, + num_layers, + batch_first, + dropout, + train, + bidirectional, + batch_sizes, + dropout_state, + reserve, + {output_mask[0], output_mask[1], output_mask[2]}); std::vector dw; if (output_mask[3]) { - dw = at::native::_cudnn_rnn_backward_weight(input, weight, weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve); + dw = at::native::_cudnn_rnn_backward_weight( + input, + weight, + weight_stride0, + weight_buf, + hx, + cx, + output, + mode, + hidden_size, + proj_size, + num_layers, + batch_first, + dropout, + train, + bidirectional, + batch_sizes, + dropout_state, + reserve); } - return std::tuple>{dx, dhx, dcx, dw}; + return std::tuple>{ + dx, dhx, dcx, dw}; } // TODO: I am not sure if we actually need the 'dropout' and 'train' parameters @@ -1785,13 +2210,18 @@ std::tuple> _cudnn_rnn_backward( // as input. The codegen currently assumes that ALL factory functions // take TensorOptions, so it's just a lot easier for this function to // be bound if it also does it. -Tensor _cudnn_init_dropout_state(double dropout, bool train, int64_t dropout_seed, +Tensor _cudnn_init_dropout_state( + double dropout, + bool train, + int64_t dropout_seed, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] - TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); + TensorOptions options = + TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory( + pin_memory); auto handle = getCudnnHandle(); DropoutDescriptor dropout_desc; @@ -1811,82 +2241,94 @@ std::tuple unpack_hidden(const Tensor& hidden) { return std::make_tuple(hidden, at::Tensor{}); } -std::tuple unpack_hidden(const std::tuple& hidden) { +std::tuple unpack_hidden( + const std::tuple& hidden) { return hidden; } -template +template hidden_type pack_hidden(const Tensor& hx, const Tensor& cx) { - static_assert(std::is_same::value, "pack_hidden not implemented for this type"); + static_assert( + std::is_same::value, + "pack_hidden not implemented for this type"); AT_ERROR("NOT IMPLEMENTED"); } -template<> +template <> Tensor pack_hidden(const Tensor& hx, const Tensor& cx) { AT_ASSERT(cx.numel() == 0); return hx; } -template<> -std::tuple pack_hidden>(const Tensor& hx, const Tensor& cx) { +template <> +std::tuple pack_hidden>( + const Tensor& hx, + const Tensor& cx) { return std::make_tuple(hx, cx); } /** * Note [DropoutState and CUDA graph capture] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * (1) Telling a capturing stream to wait on an event recorded in a non-capturing stream is an error. 
- * (2) Telling a non-capturing stream to wait on an event recorded during capture is also an error. + * (1) Telling a capturing stream to wait on an event recorded in a + non-capturing stream is an error. + * (2) Telling a non-capturing stream to wait on an event recorded during + capture is also an error. * - * So DropoutState's usage syncs could error if an RNN with dropout is called in an uncaptured region - * then called in a captured region (triggering 1), or called in a captured region then called - # in an uncaptured region (triggering 2). + * So DropoutState's usage syncs could error if an RNN with dropout is called in + an uncaptured region + * then called in a captured region (triggering 1), or called in a captured + region then called # in an uncaptured region (triggering 2). * - * To prevent 1 and 2, lock() only syncs on the last usage event if it was recorded in the same - * capture state as the current state (which also means the same graph, if capture is in progress). + * To prevent 1 and 2, lock() only syncs on the last usage event if it was + recorded in the same + * capture state as the current state (which also means the same graph, if + capture is in progress). * - * The solution should be safe as long as capture obeys the following restrictions: + * The solution should be safe as long as capture obeys the following + restrictions: * - Only one capture may be underway at a time in a given process. - * - While a capture is underway, no calls to eager ops on noncapturing streams (on any thread) + * - While a capture is underway, no calls to eager ops on noncapturing streams + (on any thread) * may interleave with the captured ops. * - * TODO: As people experiment with capture, keep an eye out for use cases that might need to + * TODO: As people experiment with capture, keep an eye out for use cases that + might need to * relax those restrictions. * * See https://github.com/pytorch/pytorch/pull/56433 for more discussion. */ struct DropoutState { - // Both buffer and event are lazily instantiated when a dropout state is needed - // for the first time. Note that in this case needed != used, as we don't need - // a buffer to e.g. run RNNs in test mode. + // Both buffer and event are lazily instantiated when a dropout state is + // needed for the first time. Note that in this case needed != used, as we + // don't need a buffer to e.g. run RNNs in test mode. at::Tensor buffer; c10::optional event; std::mutex mutex; #if !defined(USE_ROCM) - // cudaStreamGetCaptureInfo will never give back a capture id of 0, so 0 can serve - // as a sentinel value that capture was not underway. + // cudaStreamGetCaptureInfo will never give back a capture id of 0, so 0 can + // serve as a sentinel value that capture was not underway. cuda::CaptureId_t capture_id_last_lock = 0; cuda::CaptureId_t capture_id_last_unlock = 0; #endif // Every time we use a dropout state, we need to synchronize with its event, // to make sure all previous uses finish running before this one starts. Once - // we're done, we record the event to allow others to synchronize with this kernel. - // Those events are really needed only for inter-stream sync on a single GPU. - // I doubt anyone will want to run cuDNN RNNs in parallel on a single GPU, so - // they should end up being complete no-ops. + // we're done, we record the event to allow others to synchronize with this + // kernel. Those events are really needed only for inter-stream sync on a + // single GPU. 
I doubt anyone will want to run cuDNN RNNs in parallel on a + // single GPU, so they should end up being complete no-ops. void lock() { - // NB: We can't ignore the lock even when event is undefined, because someone - // could then define it before we get to unlock(). + // NB: We can't ignore the lock even when event is undefined, because + // someone could then define it before we get to unlock(). mutex.lock(); if (event) { #if !defined(USE_ROCM) // See Note [DropoutState and CUDA graph capture] cudaStreamCaptureStatus status; - AT_CUDA_CHECK(cudaStreamGetCaptureInfo(cuda::getCurrentCUDAStream(), - &status, - &capture_id_last_lock)); + AT_CUDA_CHECK(cudaStreamGetCaptureInfo( + cuda::getCurrentCUDAStream(), &status, &capture_id_last_lock)); if (status == cudaStreamCaptureStatus::cudaStreamCaptureStatusNone) { capture_id_last_lock = 0; } @@ -1905,9 +2347,8 @@ struct DropoutState { #if !defined(USE_ROCM) // See Note [DropoutState and CUDA graph capture] cudaStreamCaptureStatus status; - AT_CUDA_CHECK(cudaStreamGetCaptureInfo(cuda::getCurrentCUDAStream(), - &status, - &capture_id_last_unlock)); + AT_CUDA_CHECK(cudaStreamGetCaptureInfo( + cuda::getCurrentCUDAStream(), &status, &capture_id_last_unlock)); if (status == cudaStreamCaptureStatus::cudaStreamCaptureStatusNone) { capture_id_last_unlock = 0; } @@ -1918,27 +2359,34 @@ struct DropoutState { } }; -DropoutState& get_dropout_state(double dropout_p, bool train, TensorOptions options) { - // Each state is slightly over 2MB and initialized lazily, so it's fine to cache them. - static std::vector dropout_state_cache { static_cast(cuda::getNumGPUs()) }; +DropoutState& get_dropout_state( + double dropout_p, + bool train, + TensorOptions options) { + // Each state is slightly over 2MB and initialized lazily, so it's fine to + // cache them. + static std::vector dropout_state_cache{ + static_cast(cuda::getNumGPUs())}; static std::mutex state_cache_mut; AT_ASSERT(options.device().is_cuda()); - int device = options.device().index(); + auto device = options.device().index(); - std::unique_lock lock {state_cache_mut}; + std::unique_lock lock{state_cache_mut}; auto& state = dropout_state_cache.at(device); if (train && dropout_p > 0) { - const auto &gen = at::detail::getCUDAHooks().getDefaultCUDAGenerator(device); + const auto& gen = + at::detail::getCUDAHooks().getDefaultCUDAGenerator(device); auto gen_impl = gen.get(); bool reset_rnn_state = gen_impl->reset_rnn_state(); if (!state.buffer.defined() || reset_rnn_state) { - std::unique_lock lock {state.mutex}; - int64_t seed = at::empty({}, options.dtype(at::kLong)).random_(gen).item(); + std::unique_lock lock{state.mutex}; + int64_t seed = + at::empty({}, options.dtype(at::kLong)).random_(gen).item(); state.buffer = at::_cudnn_init_dropout_state( dropout_p, train, seed, options.dtype(at::kByte)); - // NB: CUDA binds the event to a device at creation time, so we can initialize it - // only now, when we know we're on the correct device. + // NB: CUDA binds the event to a device at creation time, so we can + // initialize it only now, when we know we're on the correct device. 
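
The locking scheme above boils down to: remember which capture ID (0 when not capturing) the last event record happened under, and only wait on that event when the current stream is in the same capture state. A minimal standalone sketch of that idea, using the same three-argument cudaStreamGetCaptureInfo call as the hunk above; the CaptureAwareEvent type and its member names are illustrative, not part of ATen, and error checking is omitted.

#include <cuda_runtime.h>

struct CaptureAwareEvent {
  cudaEvent_t event = nullptr;
  unsigned long long last_record_id = 0;  // 0 == recorded outside capture

  static unsigned long long current_capture_id(cudaStream_t stream) {
    cudaStreamCaptureStatus status;
    unsigned long long id = 0;
    cudaStreamGetCaptureInfo(stream, &status, &id);
    return status == cudaStreamCaptureStatusNone ? 0ull : id;
  }

  // Wait only if the last record happened in the same capture state as the
  // current one, which sidesteps the two illegal cross-capture waits above.
  void wait_if_safe(cudaStream_t stream) {
    if (event && current_capture_id(stream) == last_record_id) {
      cudaStreamWaitEvent(stream, event, 0);
    }
  }

  void record(cudaStream_t stream) {
    if (!event) {
      cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
    }
    cudaEventRecord(event, stream);
    last_record_id = current_capture_id(stream);
  }
};
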
if (!state.event.has_value()) { state.event.emplace(); } @@ -1948,12 +2396,17 @@ DropoutState& get_dropout_state(double dropout_p, bool train, TensorOptions opti } Tensor try_get_weight_buf( - const Tensor& input, TensorList parameters, bool has_biases, - cudnnRNNMode_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool bidirectional) { - + const Tensor& input, + TensorList parameters, + bool has_biases, + cudnnRNNMode_t mode, + c10::SymInt hidden_size, + c10::SymInt proj_size, + int64_t num_layers, + bool bidirectional) { // Prepare all relevant descriptors auto handle = getCudnnHandle(); - auto & any_param = parameters.at(0); + auto& any_param = parameters.at(0); auto datatype = getCudnnDataType(any_param); // Something very naughty is happening here. try_get_weight_buf @@ -1965,20 +2418,36 @@ Tensor try_get_weight_buf( // the relationships RNNDescriptorParams rnn; #ifndef USE_CUDNN_RNN_V8_API - rnn.set(mode, hidden_size.guard_int(__FILE__, __LINE__), proj_size.guard_int(__FILE__, __LINE__), num_layers, bidirectional, promote_rnn_math_type(datatype), datatype); + rnn.set( + mode, + hidden_size.guard_int(__FILE__, __LINE__), + proj_size.guard_int(__FILE__, __LINE__), + num_layers, + bidirectional, + promote_rnn_math_type(datatype), + datatype); #else auto cudnn_input_size = input.size(-1); - auto packed = false; // eqy: bogus as we do not know if the input is packed here - // again, it should also not affect the weights - rnn.set(mode, cudnn_input_size, packed, hidden_size.guard_int(__FILE__, __LINE__), proj_size.guard_int(__FILE__, __LINE__), num_layers, bidirectional, promote_rnn_math_type(datatype), datatype); + auto packed = false; // eqy: bogus as we do not know if the input is packed + // here again, it should also not affect the weights + rnn.set( + mode, + cudnn_input_size, + packed, + hidden_size.guard_int(__FILE__, __LINE__), + proj_size.guard_int(__FILE__, __LINE__), + num_layers, + bidirectional, + promote_rnn_math_type(datatype), + datatype); #endif RNNDescriptor rnn_desc = rnn.descriptor(handle); - TensorGeometry x_geom ({1, input.sym_size(-1).guard_int(__FILE__, __LINE__)}); + TensorGeometry x_geom({1, input.sym_size(-1).guard_int(__FILE__, __LINE__)}); TensorDescriptor x_desc; // datatype for x_desc comes from any_param, not input. - // try_get_weight_buf's job is to check "is the weight buffer correctly laid out - // for us to run it with input of the same datatype?" + // try_get_weight_buf's job is to check "is the weight buffer correctly laid + // out for us to run it with input of the same datatype?" 
x_desc.set(datatype, x_geom.sizes(), x_geom.strides(), 5); #ifndef USE_CUDNN_RNN_V8_API @@ -2008,40 +2477,51 @@ Tensor try_get_weight_buf( if (has_biases) { AT_ASSERT(num_ptrs == num_parameters); for (const auto i : c10::irange(num_parameters)) { - if (expected_data_ptrs[i] != parameters[i].data_ptr()) return {}; + if (expected_data_ptrs[i] != parameters[i].data_ptr()) + return {}; } } else { AT_ASSERT(num_parameters % 3 == 0); AT_ASSERT(num_ptrs == num_parameters * 5 / 3); - for (int64_t param_i = 0, ptr_i = 0; - ptr_i < num_ptrs; - ptr_i += 5, param_i += 3) { - if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) return {}; - if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) return {}; - if (expected_data_ptrs[ptr_i + 4] != parameters[param_i + 2].data_ptr()) return {}; + for (int64_t param_i = 0, ptr_i = 0; ptr_i < num_ptrs; + ptr_i += 5, param_i += 3) { + if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) + return {}; + if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) + return {}; + if (expected_data_ptrs[ptr_i + 4] != parameters[param_i + 2].data_ptr()) + return {}; } } } else { AT_ASSERT(num_ptrs == (num_parameters * (has_biases ? 1 : 2))); AT_ASSERT(num_parameters % (has_biases ? 4 : 2) == 0); - for (int64_t param_i = 0, ptr_i = 0; - ptr_i < num_ptrs; - ptr_i += (has_biases ? 2 : 4), param_i += 2) { - if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) return {}; - if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) return {}; + for (int64_t param_i = 0, ptr_i = 0; ptr_i < num_ptrs; + ptr_i += (has_biases ? 2 : 4), param_i += 2) { + if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) + return {}; + if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) + return {}; } } - if (!parameters[num_parameters - 1].is_contiguous()) return {}; + if (!parameters[num_parameters - 1].is_contiguous()) + return {}; return weight_buf; } -template +template std::pair _cudnn_impl( - const Tensor& input, const Tensor& _batch_sizes, const hidden_type& hidden, - TensorList params, bool has_biases, cudnnRNNMode_t mode, - int64_t num_layers, double dropout_p, bool train, bool bidirectional) { - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); + const Tensor& input, + const Tensor& _batch_sizes, + const hidden_type& hidden, + TensorList params, + bool has_biases, + cudnnRNNMode_t mode, + int64_t num_layers, + double dropout_p, + bool train, + bool bidirectional) { + auto [hx, cx] = unpack_hidden(hidden); auto hidden_size = hx.sym_size(2); SymInt proj_size = 0; // For LSTM models with projections hidden size could be different @@ -2050,40 +2530,72 @@ std::pair _cudnn_impl( proj_size = hx.sym_size(2); } - // TODO: try_get_weight_buf returns a Tensor, but _cudnn_rnn below takes a c10::optional - // in weight_buf's slot. Do we want try_get_weight_buf to return a c10::optional - // instead of a defined or undefined Tensor? + // TODO: try_get_weight_buf returns a Tensor, but _cudnn_rnn below takes a + // c10::optional in weight_buf's slot. Do we want try_get_weight_buf + // to return a c10::optional instead of a defined or undefined Tensor? 
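
The pointer comparisons above are the core of try_get_weight_buf: cuDNN reports the address at which it expects each weight and bias matrix inside the flat weight buffer, and the buffer is only reused when every parameter tensor already aliases its expected slot. A condensed sketch of that check, collapsing the biased and bias-free cases into one loop; the helper name is illustrative, and expected_data_ptrs stands in for the pointers queried from cuDNN.

#include <ATen/ATen.h>
#include <c10/util/irange.h>
#include <vector>

static bool params_alias_flat_buffer(
    const std::vector<void*>& expected_data_ptrs,
    at::TensorList parameters) {
  if (parameters.empty() ||
      expected_data_ptrs.size() != parameters.size()) {
    return false;  // layout mismatch: caller falls back to copying weights
  }
  for (const auto i : c10::irange(parameters.size())) {
    if (expected_data_ptrs[i] != parameters[i].data_ptr()) {
      return false;  // parameter does not live at its expected offset
    }
  }
  // cuDNN also requires the trailing parameter to be contiguous.
  return parameters.back().is_contiguous();
}
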
at::cuda::OptionalCUDAGuard guard(input.get_device()); auto weight_buf = try_get_weight_buf( - input, params, has_biases, mode, hidden_size, proj_size, num_layers, bidirectional); + input, + params, + has_biases, + mode, + hidden_size, + proj_size, + num_layers, + bidirectional); TORCH_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D"); - IntArrayRef batch_sizes { _batch_sizes.data_ptr(), static_cast(_batch_sizes.size(0)) }; + IntArrayRef batch_sizes{ + _batch_sizes.data_ptr(), + static_cast(_batch_sizes.size(0))}; - auto & dropout_state = get_dropout_state(dropout_p, train, input.options()); - std::unique_lock lock { dropout_state }; + auto& dropout_state = get_dropout_state(dropout_p, train, input.options()); + std::unique_lock lock{dropout_state}; int64_t num_params = has_biases ? 4 : 2; if (proj_size != 0) { ++num_params; } - auto sym_batch_sizes = c10::SymIntArrayRef(reinterpret_cast(batch_sizes.data()), batch_sizes.size()); + auto sym_batch_sizes = c10::SymIntArrayRef( + reinterpret_cast(batch_sizes.data()), + batch_sizes.size()); // cudnn_output = std::tuple auto cudnn_output = at::_cudnn_rnn_symint( - input, params, num_params, weight_buf, - hx, cx, static_cast(mode), hidden_size, proj_size, num_layers, /*batch_first=*/false, - dropout_p, train, bidirectional, sym_batch_sizes, dropout_state.buffer); + input, + params, + num_params, + weight_buf, + hx, + cx, + static_cast(mode), + hidden_size, + proj_size, + num_layers, + /*batch_first=*/false, + dropout_p, + train, + bidirectional, + sym_batch_sizes, + dropout_state.buffer); - return {std::get<0>(cudnn_output), - pack_hidden(std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; + return { + std::get<0>(cudnn_output), + pack_hidden( + std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; } -template +template std::pair _cudnn_impl( - const Tensor& input, const hidden_type& hidden, - TensorList params, bool has_biases, cudnnRNNMode_t mode, - int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); + const Tensor& input, + const hidden_type& hidden, + TensorList params, + bool has_biases, + cudnnRNNMode_t mode, + int64_t num_layers, + double dropout_p, + bool train, + bool bidirectional, + bool batch_first) { + auto [hx, cx] = unpack_hidden(hidden); auto hidden_size = hx.sym_size(2); c10::SymInt proj_size = 0; // For LSTM models with projections hidden size could be different @@ -2093,64 +2605,156 @@ std::pair _cudnn_impl( } at::cuda::OptionalCUDAGuard guard(input.get_device()); auto weight_buf = try_get_weight_buf( - input, params, has_biases, mode, hidden_size, proj_size, num_layers, bidirectional); - auto & dropout_state = get_dropout_state(dropout_p, train, input.options()); - std::unique_lock lock { dropout_state }; + input, + params, + has_biases, + mode, + hidden_size, + proj_size, + num_layers, + bidirectional); + auto& dropout_state = get_dropout_state(dropout_p, train, input.options()); + std::unique_lock lock{dropout_state}; int64_t num_params = has_biases ? 
4 : 2; if (proj_size != 0) { ++num_params; } // cudnn_output = std::tuple auto cudnn_output = at::_cudnn_rnn_symint( - input, params, num_params, weight_buf, - hx, cx, static_cast(mode), hidden_size, proj_size, num_layers, batch_first, dropout_p, - train, bidirectional, /*batch_sizes=*/{}, dropout_state.buffer); + input, + params, + num_params, + weight_buf, + hx, + cx, + static_cast(mode), + hidden_size, + proj_size, + num_layers, + batch_first, + dropout_p, + train, + bidirectional, + /*batch_sizes=*/{}, + dropout_state.buffer); - return {std::get<0>(cudnn_output), - pack_hidden(std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; + return { + std::get<0>(cudnn_output), + pack_hidden( + std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; } -#define ONE_HIDDEN_RNN(NAME, MODE) \ -void NAME##_cudnn(Tensor& output, Tensor& hy, \ - const Tensor& input, const Tensor& hx, \ - TensorList params, bool has_biases, \ - int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { \ - std::tie(output, hy) = _cudnn_impl(input, hx, params, has_biases, \ - MODE, num_layers, dropout_p, train, bidirectional, batch_first); \ -} \ - \ -void NAME##_packed_cudnn(Tensor& output, Tensor& hy, \ - const Tensor& data, const Tensor& batch_sizes, const Tensor& hx, \ - TensorList params, bool has_biases, \ - int64_t num_layers, double dropout_p, bool train, bool bidirectional) { \ - std::tie(output, hy) = _cudnn_impl(data, batch_sizes, hx, params, \ - has_biases, MODE, num_layers, dropout_p, train, bidirectional); \ -} \ - \ -REGISTER_CUDA_DISPATCH(NAME##_cudnn_stub, &NAME##_cudnn); \ -REGISTER_CUDA_DISPATCH(NAME##_packed_cudnn_stub, &NAME##_packed_cudnn); +#define ONE_HIDDEN_RNN(NAME, MODE) \ + void NAME##_cudnn( \ + Tensor& output, \ + Tensor& hy, \ + const Tensor& input, \ + const Tensor& hx, \ + TensorList params, \ + bool has_biases, \ + int64_t num_layers, \ + double dropout_p, \ + bool train, \ + bool bidirectional, \ + bool batch_first) { \ + std::tie(output, hy) = _cudnn_impl( \ + input, \ + hx, \ + params, \ + has_biases, \ + MODE, \ + num_layers, \ + dropout_p, \ + train, \ + bidirectional, \ + batch_first); \ + } \ + \ + void NAME##_packed_cudnn( \ + Tensor& output, \ + Tensor& hy, \ + const Tensor& data, \ + const Tensor& batch_sizes, \ + const Tensor& hx, \ + TensorList params, \ + bool has_biases, \ + int64_t num_layers, \ + double dropout_p, \ + bool train, \ + bool bidirectional) { \ + std::tie(output, hy) = _cudnn_impl( \ + data, \ + batch_sizes, \ + hx, \ + params, \ + has_biases, \ + MODE, \ + num_layers, \ + dropout_p, \ + train, \ + bidirectional); \ + } \ + \ + REGISTER_CUDA_DISPATCH(NAME##_cudnn_stub, &NAME##_cudnn); \ + REGISTER_CUDA_DISPATCH(NAME##_packed_cudnn_stub, &NAME##_packed_cudnn); ONE_HIDDEN_RNN(gru, CUDNN_GRU) ONE_HIDDEN_RNN(rnn_tanh, CUDNN_RNN_TANH) ONE_HIDDEN_RNN(rnn_relu, CUDNN_RNN_RELU) -void lstm_cudnn(Tensor& output, Tensor& hy, Tensor& cy, - const Tensor& input, TensorList hx, - TensorList params, bool has_biases, - int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - auto result = _cudnn_impl(input, std::make_tuple(hx[0], hx[1]), params, has_biases, - CUDNN_LSTM, num_layers, dropout_p, train, bidirectional, batch_first); +void lstm_cudnn( + Tensor& output, + Tensor& hy, + Tensor& cy, + const Tensor& input, + TensorList hx, + TensorList params, + bool has_biases, + int64_t num_layers, + double dropout_p, + bool train, + bool bidirectional, + bool batch_first) { + auto result = _cudnn_impl( + input, + 
std::make_tuple(hx[0], hx[1]), + params, + has_biases, + CUDNN_LSTM, + num_layers, + dropout_p, + train, + bidirectional, + batch_first); output = result.first; hy = std::get<0>(result.second); cy = std::get<1>(result.second); } -void lstm_packed_cudnn(Tensor& output, Tensor& hy, Tensor& cy, - const Tensor& data, const Tensor& batch_sizes, TensorList hx, - TensorList params, bool has_biases, - int64_t num_layers, double dropout_p, bool train, bool bidirectional) { - auto result = _cudnn_impl(data, batch_sizes, std::make_tuple(hx[0], hx[1]), - params, has_biases, CUDNN_LSTM, num_layers, dropout_p, train, bidirectional); +void lstm_packed_cudnn( + Tensor& output, + Tensor& hy, + Tensor& cy, + const Tensor& data, + const Tensor& batch_sizes, + TensorList hx, + TensorList params, + bool has_biases, + int64_t num_layers, + double dropout_p, + bool train, + bool bidirectional) { + auto result = _cudnn_impl( + data, + batch_sizes, + std::make_tuple(hx[0], hx[1]), + params, + has_biases, + CUDNN_LSTM, + num_layers, + dropout_p, + train, + bidirectional); output = result.first; hy = std::get<0>(result.second); cy = std::get<1>(result.second); @@ -2159,8 +2763,9 @@ void lstm_packed_cudnn(Tensor& output, Tensor& hy, Tensor& cy, REGISTER_CUDA_DISPATCH(lstm_cudnn_stub, &lstm_cudnn); REGISTER_CUDA_DISPATCH(lstm_packed_cudnn_stub, &lstm_packed_cudnn); -} // anonymous namepsace +} // namespace -}} // namespace at::native +} // namespace at +} // namespace at #endif // AT_CUDNN_ENABLED() diff --git a/aten/src/ATen/native/cudnn/RNNUtils.h b/aten/src/ATen/native/cudnn/RNNUtils.h index 64a2ecbef62e1..7e2869a805740 100644 --- a/aten/src/ATen/native/cudnn/RNNUtils.h +++ b/aten/src/ATen/native/cudnn/RNNUtils.h @@ -1,9 +1,9 @@ #pragma once -#include #include #include #include +#include // Declares utilities used by RNN.cpp and also needed by external consumers namespace at { diff --git a/aten/src/ATen/native/layer_norm.cpp b/aten/src/ATen/native/layer_norm.cpp index 88d53da856d85..27a701dd2eb49 100644 --- a/aten/src/ATen/native/layer_norm.cpp +++ b/aten/src/ATen/native/layer_norm.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -18,6 +19,9 @@ #include #include #include +#include +#include +#include #include #endif @@ -258,4 +262,49 @@ std::tuple math_native_layer_norm( rstd = rstd.view(stat_shape); return std::make_tuple(out, mean, rstd); } + +Tensor rms_norm( + const Tensor& input, + IntArrayRef normalized_shape, + const c10::optional& weight_opt /* optional */, + c10::optional eps) { + + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + auto bias_opt = at::optional(); + const Tensor& bias = *at::borrow_from_optional_tensor(bias_opt); + (void) _check_layer_norm_inputs(input, normalized_shape, weight, bias); + + std::vector dims_to_reduce; + for (const auto i : c10::irange(normalized_shape.size())) { + dims_to_reduce.push_back(input.dim() - i - 1); + } + IntArrayRef dims_to_reduce_ref = IntArrayRef(dims_to_reduce); + + auto result = AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + input.scalar_type(), + "rms_norm", + [&] { + scalar_t eps_val; + if (!eps.has_value()) { + eps_val = std::numeric_limits::type>::epsilon(); + } else { + eps_val = eps.value(); + } + + auto result = input.mul(at::rsqrt(at::pow(input, 2).mean(dims_to_reduce_ref, /*keep_dim=*/true).add_(eps_val))); + + if 
(weight_opt.has_value()) { + result = result.mul(weight_opt.value()); + } + + return result; + }); + + return result; + +} } // namespace at::native diff --git a/aten/src/ATen/native/layer_norm.h b/aten/src/ATen/native/layer_norm.h index 13fb1e4783d20..38e63569586e3 100644 --- a/aten/src/ATen/native/layer_norm.h +++ b/aten/src/ATen/native/layer_norm.h @@ -71,6 +71,12 @@ void layer_norm_cpu_out( int64_t M, int64_t N); +Tensor rms_norm( + const Tensor& input, + IntArrayRef normalized_shape, + const c10::optional& weight_opt /* optional */, + c10::optional eps); + using forward_fn = void (*)( const Tensor& /* X */, const Tensor& /* gamma */, diff --git a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm index 14c98f99cff02..926a52ffb8d20 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm +++ b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm @@ -40,8 +40,7 @@ - (void)endSynchronization:(NSError*)error { if (_imageWrapper) { _imageWrapper->release(); } - // throw an exception with error details - METAL_THROW_IF_ERROR(error, "Command buffer execution failed!"); + // T159183991: ignore error. We prefer to not crash the app. } } diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm index 04c82e94bda01..dbb5ee5c98de8 100644 --- a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm +++ b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm @@ -647,7 +647,7 @@ bool test_view2() { } bool test_view3() { - // nonarry -> array + // nonarray -> array __block std::vector size{5, 8}; return TEST(size, __PRETTY_FUNCTION__, ^bool { auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat)); diff --git a/aten/src/ATen/native/metal/ops/MetalSoftmax.mm b/aten/src/ATen/native/metal/ops/MetalSoftmax.mm index cfd55039cabd2..11ebe255953f2 100644 --- a/aten/src/ATen/native/metal/ops/MetalSoftmax.mm +++ b/aten/src/ATen/native/metal/ops/MetalSoftmax.mm @@ -20,7 +20,7 @@ Tensor mpscnn_softmax( int64_t dim, c10::optional dtype) { TORCH_CHECK(input.is_metal()); - // TODO: [T87180544] Implment softmax/log_softmax in metal shaders + // TODO: [T87180544] Implement softmax/log_softmax in metal shaders TORCH_CHECK(input.dim() == 2); if(input.numel() == 0){ return makeTensor({input.sizes().vec()}, input.options()); diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index 6ae3b6ab143bc..7c641b3fadd89 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -117,11 +117,14 @@ std::tuple miopen_batch_norm( save_var = at::empty({ num_features }, weight_t.options()); MIOPEN_CHECK(miopenBatchNormalizationForwardTraining( handle, mode, &one, &zero, - idesc.desc(), input->data_ptr(), + idesc.desc(), input->const_data_ptr(), idesc.desc(), output->data_ptr(), wdesc.desc(), - weight->data_ptr(), - bias->data_ptr(), + // NOTE: MIOpen docs say that the bnScale and bnBias args are only inputs, + // not outputs. 
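
Stepping back to the rms_norm composite added in layer_norm.cpp above: it computes y = x * rsqrt(mean(x^2, over normalized_shape) + eps) * weight, with eps defaulting to the dtype's machine epsilon when unspecified. A minimal ATen sketch of the same math, specialized to normalizing over the last dimension; the function name and the fixed 1e-6 default are illustrative, not the signature registered by this patch.

#include <ATen/ATen.h>

at::Tensor rms_norm_last_dim(
    const at::Tensor& x,
    const at::Tensor& weight,
    double eps = 1e-6) {
  const int64_t last_dim = x.dim() - 1;
  // Mean of x^2 over the last dimension, kept for broadcasting.
  auto ms = at::pow(x, 2).mean(at::IntArrayRef{last_dim}, /*keepdim=*/true);
  // Scale by the reciprocal RMS, then by the learned weight.
  return x.mul(at::rsqrt(ms.add(eps))).mul(weight);
}
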
However, unfortunately the function signature only takes + // non-const pointers, presumably by accident + const_cast(weight->const_data_ptr()), + const_cast(bias->const_data_ptr()), exponential_average_factor, at::maybe_data_ptr(running_mean), at::maybe_data_ptr(running_var), @@ -133,11 +136,14 @@ std::tuple miopen_batch_norm( save_var = at::empty({0}, weight_t.options()); MIOPEN_CHECK(miopenBatchNormalizationForwardInference( handle, mode, &one, &zero, - idesc.desc(), input->data_ptr(), + idesc.desc(), input->const_data_ptr(), idesc.desc(), output->data_ptr(), wdesc.desc(), - weight->data_ptr(), - bias->data_ptr(), + // NOTE: MIOpen docs say that the bnScale and bnBias args are only inputs, + // not outputs. However, unfortunately the function signature only takes + // non-const pointers, presumably by accident + const_cast(weight->const_data_ptr()), + const_cast(bias->const_data_ptr()), running_mean->data_ptr(), running_var->data_ptr(), epsilon)); @@ -216,15 +222,15 @@ std::tuple miopen_batch_norm_backward( MIOPEN_CHECK(miopenBatchNormalizationBackward( handle, mode, &one, &zero, &one, &zero, - idesc.desc(), input->data_ptr(), - idesc.desc(), grad_output->data_ptr(), + idesc.desc(), input->const_data_ptr(), + idesc.desc(), grad_output->const_data_ptr(), idesc.desc(), grad_input_t.data_ptr(), - wdesc.desc(), weight->data_ptr(), + wdesc.desc(), weight->const_data_ptr(), grad_weight_t.data_ptr(), grad_bias_t.data_ptr(), epsilon, - save_mean->data_ptr(), - save_var->data_ptr())); + save_mean->const_data_ptr(), + save_var->const_data_ptr())); return std::tuple{grad_input_t, grad_weight_t, grad_bias_t}; } diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 4a192cae2a20d..88f889c2cc1fa 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -371,8 +371,8 @@ struct algorithm_search { Workspace ws(max_ws_size); MIOPEN_CHECK(miopenFindConvolutionForwardAlgorithm( args.handle, - args.idesc.desc(), args.input.data_ptr(), - args.wdesc.desc(), args.weight.data_ptr(), + args.idesc.desc(), args.input.const_data_ptr(), + args.wdesc.desc(), args.weight.const_data_ptr(), args.cdesc.desc(), args.odesc.desc(), args.output.data_ptr(), 1, // just return the fastest @@ -444,8 +444,8 @@ struct algorithm_search { Workspace ws(max_ws_size); MIOPEN_CHECK(miopenFindConvolutionBackwardDataAlgorithm( args.handle, - args.odesc.desc(), args.output.data_ptr(), - args.wdesc.desc(), args.weight.data_ptr(), + args.odesc.desc(), args.output.const_data_ptr(), + args.wdesc.desc(), args.weight.const_data_ptr(), args.cdesc.desc(), args.idesc.desc(), args.input.data_ptr(), 1, // just return the fastest @@ -517,8 +517,8 @@ struct algorithm_search { Workspace ws(max_ws_size); MIOPEN_CHECK(miopenFindConvolutionBackwardWeightsAlgorithm( args.handle, - args.odesc.desc(), args.output.data_ptr(), - args.idesc.desc(), args.input.data_ptr(), + args.odesc.desc(), args.output.const_data_ptr(), + args.idesc.desc(), args.input.const_data_ptr(), args.cdesc.desc(), args.wdesc.desc(), args.weight.data_ptr(), 1, // just return the fastest @@ -599,7 +599,10 @@ void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) { cache.insert(args.params, *algo); wsscache.insert(args.params, perfResults.memory); - c10::hip::HIPCachingAllocator::emptyCache(); + if (at::native::_cudnn_get_conv_benchmark_empty_cache()) { + c10::hip::HIPCachingAllocator::emptyCache(); + } + } template @@ -682,7 +685,7 @@ void 
miopen_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const Constant one(dataType, 1); Constant zero(dataType, 0); - MIOPEN_CHECK(miopenConvolutionForwardBias(handle, &one, bdesc.desc(), bias->data_ptr(), + MIOPEN_CHECK(miopenConvolutionForwardBias(handle, &one, bdesc.desc(), bias->const_data_ptr(), &zero, odesc.desc(), output->data_ptr())); */ } @@ -730,8 +733,8 @@ void raw_miopen_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForward( args.handle, - &one, args.idesc.desc(), input.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), + &one, args.idesc.desc(), input.const_data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), fwdAlg, &zero, args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); } @@ -741,8 +744,8 @@ void raw_miopen_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForwardImmediate( args.handle, - args.wdesc.desc(), weight.data_ptr(), - args.idesc.desc(), input.data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id)); } @@ -838,8 +841,8 @@ void raw_miopen_depthwise_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForward( args.handle, - &one, args.idesc.desc(), input.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), + &one, args.idesc.desc(), input.const_data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), fwdAlg, &zero, args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); } @@ -849,8 +852,8 @@ void raw_miopen_depthwise_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForwardImmediate( args.handle, - args.wdesc.desc(), weight.data_ptr(), - args.idesc.desc(), input.data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id)); } @@ -993,8 +996,8 @@ void raw_miopen_convolution_backward_weight_out( MIOPEN_CHECK(miopenConvolutionBackwardWeights( args.handle, - &one, args.odesc.desc(), grad_output.data_ptr(), - args.idesc.desc(), input.data_ptr(), + &one, args.odesc.desc(), grad_output.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), bwdFilterAlg, &zero, args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); } @@ -1004,8 +1007,8 @@ void raw_miopen_convolution_backward_weight_out( MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( args.handle, - args.odesc.desc(), grad_output.data_ptr(), - args.idesc.desc(), input.data_ptr(), + args.odesc.desc(), grad_output.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id)); } @@ -1037,8 +1040,8 @@ void raw_miopen_depthwise_convolution_backward_weight_out( MIOPEN_CHECK(miopenConvolutionBackwardWeights( args.handle, - &one, args.odesc.desc(), grad_output.data_ptr(), - args.idesc.desc(), input.data_ptr(), + &one, args.odesc.desc(), grad_output.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), bwdFilterAlg, &zero, args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); } @@ -1048,8 +1051,8 @@ void raw_miopen_depthwise_convolution_backward_weight_out( MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( args.handle, - args.odesc.desc(), grad_output.data_ptr(), - args.idesc.desc(), input.data_ptr(), + 
args.odesc.desc(), grad_output.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id)); } @@ -1242,8 +1245,8 @@ void raw_miopen_convolution_backward_input_out( MIOPEN_CHECK(miopenConvolutionBackwardData( args.handle, - &one, args.odesc.desc(), grad_output.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), + &one, args.odesc.desc(), grad_output.const_data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), bwdDataAlg, &zero, args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size)); } @@ -1253,8 +1256,8 @@ void raw_miopen_convolution_backward_input_out( MIOPEN_CHECK(miopenConvolutionBackwardDataImmediate( args.handle, - args.odesc.desc(), grad_output.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), + args.odesc.desc(), grad_output.const_data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id)); } @@ -1351,8 +1354,8 @@ void raw_miopen_depthwise_convolution_backward_input_out( MIOPEN_CHECK(miopenConvolutionBackwardData( args.handle, - &one, args.odesc.desc(), grad_output.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), + &one, args.odesc.desc(), grad_output.const_data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), bwdDataAlg, &zero, args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size)); } @@ -1362,8 +1365,8 @@ void raw_miopen_depthwise_convolution_backward_input_out( MIOPEN_CHECK(miopenConvolutionBackwardDataImmediate( args.handle, - args.odesc.desc(), grad_output.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), + args.odesc.desc(), grad_output.const_data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id)); } @@ -1528,11 +1531,11 @@ void raw_miopen_convolution_relu_out( float activ_gamma = static_cast(0); miopenOperatorArgs_t fusionArgs; MIOPEN_CHECK(miopenCreateOperatorArgs(&fusionArgs)); - MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.data_ptr())); - MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.data_ptr())); + MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.const_data_ptr())); + MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.const_data_ptr())); MIOPEN_CHECK(miopenSetOpArgsActivForward(fusionArgs, activOp, &alpha, &beta, activ_alpha, activ_beta, activ_gamma)); - miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs); + miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.const_data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs); // Cleanup miopenDestroyFusionPlan(fusePlanDesc); diff --git a/aten/src/ATen/native/miopen/RNN_miopen.cpp b/aten/src/ATen/native/miopen/RNN_miopen.cpp index 51518fbfeb42a..7b2b2ab80e553 100644 --- a/aten/src/ATen/native/miopen/RNN_miopen.cpp +++ b/aten/src/ATen/native/miopen/RNN_miopen.cpp @@ -500,9 +500,7 @@ std::tuple miopen_rnn( auto weight_buf = at::empty(num_weights, x.options()); w_desc.set(weight_buf, 3); weight_buf.zero_(); - std::vector params; - size_t params_stride0; - std::tie(params, params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, 
descs.x_descs[0], w_desc, weight_buf); + auto [params, params_stride0] = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, weight_buf); if (fn_mode < 2) _copyParams(MatrixRef{weight, static_cast(weight_stride0)}, MatrixRef{params, params_stride0}); @@ -742,9 +740,7 @@ std::vector miopen_rnn_backward_weight( fn_reserve.data_ptr(), fn_reserve.size(0) )); - std::vector grad_params_arr; - size_t grad_params_stride0; - std::tie(grad_params_arr, grad_params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, dw); + auto [grad_params_arr, grad_params_stride0] = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, dw); if (grad_params_stride0 == static_cast(weight_stride0)) { _viewParams(MatrixRef{grad_params_arr, grad_params_stride0}, MatrixRef{weight_arr, static_cast(weight_stride0)}); @@ -782,8 +778,7 @@ std::tuple> miopen_rnn_backward( auto grad_hy = grad_hy_r.defined() ? grad_hy_r : at::zeros_like(hx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? grad_cy_r : at::zeros_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT)) : grad_cy_r; - Tensor dx, dhx, dcx, ws; - std::tie(dx, dhx, dcx, ws) = at::native::miopen_rnn_backward_input(input, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, {output_mask[0], output_mask[1], output_mask[2]}); + auto [dx, dhx, dcx, ws] = at::native::miopen_rnn_backward_input(input, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, {output_mask[0], output_mask[1], output_mask[2]}); std::vector dw; if (output_mask[3]) { dw = at::native::miopen_rnn_backward_weight(input, weight, weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, ws); @@ -828,8 +823,7 @@ std::pair _miopen_impl( const Tensor& input, const Tensor& _batch_sizes, const hidden_type& hidden, TensorList params, bool has_biases, miopenRNNMode_t mode, int64_t num_layers, double dropout_p, bool train, bool bidirectional) { - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); + auto [hx, cx] = unpack_hidden(hidden); int64_t hidden_size = hx.size(2); TORCH_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D"); @@ -851,8 +845,7 @@ std::pair _miopen_impl( const Tensor& input, const hidden_type& hidden, TensorList params, bool has_biases, miopenRNNMode_t mode, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); + auto [hx, cx] = unpack_hidden(hidden); int64_t hidden_size = hx.size(2); Tensor dropout_state = at::empty({0}, input.options()); @@ -915,7 +908,7 @@ void lstm_packed_miopen(Tensor& output, Tensor& hy, Tensor& cy, REGISTER_CUDA_DISPATCH(lstm_miopen_stub, &lstm_miopen); REGISTER_CUDA_DISPATCH(lstm_packed_miopen_stub, &lstm_packed_miopen); -} // anonymous namepsace +} // anonymous namespace }} //namespace native. 
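
A recurring mechanical change in these MIOpen hunks (and in the cuDNN file above) is replacing a declare-then-std::tie pair with a C++17 structured binding. A tiny self-contained illustration of the before/after shape of that edit; get_weights here is a stand-in, not a PyTorch function.

#include <cstddef>
#include <tuple>
#include <vector>

static std::tuple<std::vector<int>, size_t> get_weights() {
  return {{1, 2, 3}, 3};
}

void tie_vs_structured_binding() {
  // Before: both variables must be default-constructed, then assigned.
  std::vector<int> params;
  size_t stride0 = 0;
  std::tie(params, stride0) = get_weights();

  // After: a single declaration binds both results directly.
  auto [params2, stride0_2] = get_weights();
  (void)params2;
  (void)stride0_2;
}
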
#endif diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.cpp b/aten/src/ATen/native/mkl/LinearAlgebra.cpp index ce8d848ec1ce6..5e82613f42285 100644 --- a/aten/src/ATen/native/mkl/LinearAlgebra.cpp +++ b/aten/src/ATen/native/mkl/LinearAlgebra.cpp @@ -8,42 +8,50 @@ namespace at { namespace native { void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const float alpha, - const float** A, const int lda, const float** B, const int ldb, const float beta, - float** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const float alpha, + const float** A, const MKL_INT lda, const float** B, const MKL_INT ldb, const float beta, + float** C, const MKL_INT ldc) { TORCH_INTERNAL_ASSERT(false, "mkl_gemm_batched: ATen not compiled with MKL support"); } void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const double alpha, - const double** A, const int lda, const double** B, const int ldb, const double beta, - double** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const double alpha, + const double** A, const MKL_INT lda, const double** B, const MKL_INT ldb, const double beta, + double** C, const MKL_INT ldc) { TORCH_INTERNAL_ASSERT(false, "mkl_gemm_batched: ATen not compiled with MKL support"); } void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const c10::complex alpha, - const c10::complex** A, const int lda, const c10::complex** B, const int ldb, - const c10::complex beta, c10::complex** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const c10::complex alpha, + const c10::complex** A, const MKL_INT lda, const c10::complex** B, const MKL_INT ldb, + const c10::complex beta, c10::complex** C, const MKL_INT ldc) { TORCH_INTERNAL_ASSERT(false, "mkl_gemm_batched: ATen not compiled with MKL support"); } void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const c10::complex alpha, - const c10::complex** A, const int lda, const c10::complex** B, const int ldb, - const c10::complex beta, c10::complex** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const c10::complex alpha, + const c10::complex** A, const MKL_INT lda, const c10::complex** B, const MKL_INT ldb, + const c10::complex beta, c10::complex** C, const MKL_INT ldc) { TORCH_INTERNAL_ASSERT(false, "mkl_gemm_batched: ATen not compiled with MKL support"); } void mkl_gemm_bf16bf16f32( + TransposeType trans_A, TransposeType trans_B, + MKL_INT M, MKL_INT N, MKL_INT K, const float alpha, + const c10::BFloat16* A, MKL_INT lda, const c10::BFloat16* B, MKL_INT ldb, + const float beta, float* C, MKL_INT ldc) { + TORCH_INTERNAL_ASSERT(false, "mkl_gemm_bf16bf16f32: ATen not compiled with MKL support"); +} + +void mkl_gemm_f16f16f32( TransposeType trans_A, TransposeType trans_B, int M, int N, int K, const float alpha, - const c10::BFloat16* A, int lda, const c10::BFloat16* B, int ldb, + const c10::Half* A, int lda, const c10::Half* B, int ldb, const float beta, float* C, int ldc) { - TORCH_INTERNAL_ASSERT(false, "mkl_gemm_bf16bf16f32: ATen not compiled with MKL support"); + TORCH_INTERNAL_ASSERT(false, 
"mkl_gemm_f16f16f32: ATen not compiled with MKL support"); } }} @@ -66,9 +74,9 @@ static CBLAS_TRANSPOSE to_cblas(TransposeType x) { void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const float alpha, - const float** A, const int lda, const float** B, const int ldb, const float beta, - float** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const float alpha, + const float** A, const MKL_INT lda, const float** B, const MKL_INT ldb, const float beta, + float** C, const MKL_INT ldc) { auto transa_cblas = to_cblas(trans_A); auto transb_cblas = to_cblas(trans_B); cblas_sgemm_batch(CblasColMajor, &transa_cblas, &transb_cblas, &M, &N, &K, &alpha, @@ -77,9 +85,9 @@ void mkl_gemm_batched( void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const double alpha, - const double** A, const int lda, const double** B, const int ldb, const double beta, - double** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const double alpha, + const double** A, const MKL_INT lda, const double** B, const MKL_INT ldb, const double beta, + double** C, const MKL_INT ldc) { auto transa_cblas = to_cblas(trans_A); auto transb_cblas = to_cblas(trans_B); cblas_dgemm_batch(CblasColMajor, &transa_cblas, &transb_cblas, &M, &N, &K, &alpha, @@ -88,9 +96,9 @@ void mkl_gemm_batched( void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const c10::complex alpha, - const c10::complex** A, const int lda, const c10::complex** B, const int ldb, - const c10::complex beta, c10::complex** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const c10::complex alpha, + const c10::complex** A, const MKL_INT lda, const c10::complex** B, const MKL_INT ldb, + const c10::complex beta, c10::complex** C, const MKL_INT ldc) { auto transa_cblas = to_cblas(trans_A); auto transb_cblas = to_cblas(trans_B); cblas_cgemm_batch(CblasColMajor, &transa_cblas, &transb_cblas, &M, &N, &K, @@ -101,9 +109,9 @@ void mkl_gemm_batched( void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const c10::complex alpha, - const c10::complex** A, const int lda, const c10::complex** B, const int ldb, - const c10::complex beta, c10::complex** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const c10::complex alpha, + const c10::complex** A, const MKL_INT lda, const c10::complex** B, const MKL_INT ldb, + const c10::complex beta, c10::complex** C, const MKL_INT ldc) { auto transa_cblas = to_cblas(trans_A); auto transb_cblas = to_cblas(trans_B); cblas_zgemm_batch(CblasColMajor, &transa_cblas, &transb_cblas, &M, &N, &K, @@ -114,9 +122,9 @@ void mkl_gemm_batched( void mkl_gemm_bf16bf16f32( TransposeType trans_A, TransposeType trans_B, - int M, int N, int K, const float alpha, - const c10::BFloat16* A, int lda, const c10::BFloat16* B, int ldb, - const float beta, float* C, int ldc) { + MKL_INT M, MKL_INT N, MKL_INT K, const float alpha, + const c10::BFloat16* A, MKL_INT lda, const c10::BFloat16* B, MKL_INT ldb, + const float beta, float* C, MKL_INT ldc) { #ifdef MKL_HAS_SBGEMM auto transa_cblas = to_cblas(trans_A); auto 
transb_cblas = to_cblas(trans_B); @@ -127,6 +135,21 @@ void mkl_gemm_bf16bf16f32( #endif } +void mkl_gemm_f16f16f32( + TransposeType trans_A, TransposeType trans_B, + int M, int N, int K, const float alpha, + const c10::Half* A, int lda, const c10::Half* B, int ldb, + const float beta, float* C, int ldc) { +#ifdef MKL_HAS_SHGEMM + auto transa_cblas = to_cblas(trans_A); + auto transb_cblas = to_cblas(trans_B); + cblas_gemm_f16f16f32(CblasColMajor, transa_cblas, transb_cblas, M, N, K, alpha, + (const MKL_F16*)A, lda, (const MKL_F16*)B, ldb, beta, C, ldc); +#else + TORCH_INTERNAL_ASSERT(false, "mkl_gemm_f16f16f32 requires mkl version >= 2024.0"); +#endif +} + }} // namespace at::native #endif diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.h b/aten/src/ATen/native/mkl/LinearAlgebra.h index a881e7ba49b1e..54a7a5cb2c741 100644 --- a/aten/src/ATen/native/mkl/LinearAlgebra.h +++ b/aten/src/ATen/native/mkl/LinearAlgebra.h @@ -1,39 +1,51 @@ #pragma once +#include #include #include #include +#if !AT_MKL_ENABLED() +#define MKL_INT int +#else +#include +#endif + namespace at { namespace native { void mkl_gemm_batched( TransposeType trans_A, TransposeType trans_B, - int batch_size, int M, int N, int K, float alpha, - const float** A, int lda, const float** B, int ldb, float beta, - float** C, int ldc); + MKL_INT batch_size, MKL_INT M, MKL_INT N, MKL_INT K, float alpha, + const float** A, MKL_INT lda, const float** B, MKL_INT ldb, float beta, + float** C, MKL_INT ldc); void mkl_gemm_batched( TransposeType trans_A, TransposeType trans_B, - int batch_size, int M, int N, int K, double alpha, - const double** A, int lda, const double** B, int ldb, double beta, - double** C, int ldc); + MKL_INT batch_size, MKL_INT M, MKL_INT N, MKL_INT K, double alpha, + const double** A, MKL_INT lda, const double** B, MKL_INT ldb, double beta, + double** C, MKL_INT ldc); void mkl_gemm_batched( TransposeType trans_A, TransposeType trans_B, - int batch_size, int M, int N, int K, c10::complex alpha, - const c10::complex** A, int lda, const c10::complex** B, int ldb, - c10::complex beta, c10::complex** C, int ldc); + MKL_INT batch_size, MKL_INT M, MKL_INT N, MKL_INT K, c10::complex alpha, + const c10::complex** A, MKL_INT lda, const c10::complex** B, MKL_INT ldb, + c10::complex beta, c10::complex** C, MKL_INT ldc); void mkl_gemm_batched( TransposeType trans_A, TransposeType trans_B, - int batch_size, int M, int N, int K, c10::complex alpha, - const c10::complex** A, int lda, const c10::complex** B, int ldb, - c10::complex beta, c10::complex** C, int ldc); + MKL_INT batch_size, MKL_INT M, MKL_INT N, MKL_INT K, c10::complex alpha, + const c10::complex** A, MKL_INT lda, const c10::complex** B, MKL_INT ldb, + c10::complex beta, c10::complex** C, MKL_INT ldc); void mkl_gemm_bf16bf16f32( + TransposeType trans_A, TransposeType trans_B, + MKL_INT M, MKL_INT N, MKL_INT K, const float alpha, + const c10::BFloat16* A, MKL_INT lda, const c10::BFloat16* B, MKL_INT ldb, + const float beta, float* C, MKL_INT ldc); + +void mkl_gemm_f16f16f32( TransposeType trans_A, TransposeType trans_B, int M, int N, int K, const float alpha, - const c10::BFloat16* A, int lda, const c10::BFloat16* B, int ldb, + const c10::Half* A, int lda, const c10::Half* B, int ldb, const float beta, float* C, int ldc); - }} // namespace at::native diff --git a/aten/src/ATen/native/mkl/SparseCsrLinearAlgebra.cpp b/aten/src/ATen/native/mkl/SparseCsrLinearAlgebra.cpp index 3d0acb9aae751..b938ccd937a8d 100644 --- a/aten/src/ATen/native/mkl/SparseCsrLinearAlgebra.cpp +++ 
b/aten/src/ATen/native/mkl/SparseCsrLinearAlgebra.cpp @@ -2,12 +2,11 @@ #include #include -// Don't compile with MKL for MSVC/macos since linking the sparse MKL routines +// Don't compile with MKL for macos since linking the sparse MKL routines // needs some build fixes. -// https://github.com/pytorch/pytorch/pull/50937#issuecomment-778732740 // Macros source: // https://web.archive.org/web/20191012035921/http://nadeausoftware.com/articles/2012/01/c_c_tip_how_use_compiler_predefined_macros_detect_operating_system -#if !AT_MKL_ENABLED() || defined(_MSC_VER) || defined(__APPLE__) || \ +#if !AT_MKL_ENABLED() || defined(__APPLE__) || \ defined(__MACH__) namespace at { @@ -19,9 +18,7 @@ Tensor& _sparse_mm_mkl_( const Tensor& t, const Scalar& alpha, const Scalar& beta) { -#if _MSC_VER - AT_ERROR("sparse_mm_mkl: MKL support is disabled on Windows"); -#elif __APPLE__ || __MACH__ +#if __APPLE__ || __MACH__ AT_ERROR("sparse_mm_mkl: MKL support is disabled on macos/iOS."); #else AT_ERROR("sparse_mm_mkl: ATen not compiled with MKL support"); diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index cb00ce99d82e6..e26cfbf6d8eba 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -200,7 +200,7 @@ Tensor& _fft_c2c_mkl_out(const Tensor& self, IntArrayRef dim, int64_t normalizat } }} // namespace at::native -#endif /* AT_MKL_ENALED() || AT_POCKETFFT_ENABLED() */ +#endif /* AT_MKL_ENABLED() || AT_POCKETFFT_ENABLED() */ #if AT_POCKETFFT_ENABLED() #include @@ -229,7 +229,7 @@ inline std::complex *tensor_cdata(Tensor& t) { template inline const std::complex *tensor_cdata(const Tensor& t) { - return reinterpret_cast*>(t.data_ptr>()); + return reinterpret_cast*>(t.const_data_ptr>()); } template @@ -291,11 +291,11 @@ Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, pocketfft::shape_t axes(dim.begin(), dim.end()); if (self.scalar_type() == kFloat) { pocketfft::r2c(shape_from_tensor(self), stride_from_tensor(self), stride_from_tensor(out), axes, true, - self.data_ptr(), + self.const_data_ptr(), tensor_cdata(out), compute_fct(self, dim, normalization)); } else { pocketfft::r2c(shape_from_tensor(self), stride_from_tensor(self), stride_from_tensor(out), axes, true, - self.data_ptr(), + self.const_data_ptr(), tensor_cdata(out), compute_fct(self, dim, normalization)); } @@ -307,6 +307,10 @@ Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { TORCH_CHECK(self.is_complex()); + if (dim.empty()) { + return self.clone(); + } + auto out = at::empty(self.sizes(), self.options()); pocketfft::shape_t axes(dim.begin(), dim.end()); if (self.scalar_type() == kComplexFloat) { @@ -480,9 +484,9 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, // run the FFT if (forward) { - MKL_DFTI_CHECK(DftiComputeForward(descriptor.get(), input.data_ptr(), out.data_ptr())); + MKL_DFTI_CHECK(DftiComputeForward(descriptor.get(), const_cast(input.const_data_ptr()), out.data_ptr())); } else { - MKL_DFTI_CHECK(DftiComputeBackward(descriptor.get(), input.data_ptr(), out.data_ptr())); + MKL_DFTI_CHECK(DftiComputeBackward(descriptor.get(), const_cast(input.const_data_ptr()), out.data_ptr())); } // Inplace reshaping to original batch shape and inverting the dimension permutation @@ -556,6 +560,10 @@ Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t 
normalization, // n-dimensional complex to complex FFT/IFFT Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { TORCH_CHECK(self.is_complex()); + if (dim.empty()) { + return self.clone(); + } + const auto sorted_dims = _sort_dims(self, dim); auto out = at::empty(self.sizes(), self.options()); return _exec_fft(out, self, self.sizes(), sorted_dims, normalization, forward); diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp index b5e53732a472e..3e41e2f1071d0 100644 --- a/aten/src/ATen/native/mkldnn/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/Conv.cpp @@ -223,10 +223,10 @@ static void _mkldnn_convolution_out ( auto memory_format = mkldnn_convolution_memory_format(input_t.ndimension(), is_channels_last); auto input = input_t.is_mkldnn() ? input_t : input_t.contiguous(memory_format); auto weight = weight_t.is_mkldnn() ? weight_t : weight_t.contiguous(memory_format); - const ideep::tensor x = itensor_from_tensor(input); - const ideep::tensor w = itensor_from_tensor(weight); + const ideep::tensor x = itensor_from_tensor(input, /*from_const_data_ptr*/true); + const ideep::tensor w = itensor_from_tensor(weight, /*from_const_data_ptr*/true); if (bias.defined()) { - const ideep::tensor b = itensor_from_tensor(bias); + const ideep::tensor b = itensor_from_tensor(bias, /*from_const_data_ptr*/true); ideep::convolution_forward::compute_v3( x, w, @@ -704,9 +704,9 @@ Tensor _mkldnn_convolution_transpose( auto output_sizes = conv_input_size(input.sizes(), weight_IOHW_sizes, padding_expanded, output_padding_expanded, stride_expanded, dilation_expanded, groups); auto output = at::empty({0}, input.options()); - const ideep::tensor x = itensor_from_tensor(input); + const ideep::tensor x = itensor_from_tensor(input, /*from_const_data_ptr*/true); - ideep::tensor w = itensor_from_tensor(weight); + ideep::tensor w = itensor_from_tensor(weight, /*from_const_data_ptr*/true); if (!weight.is_mkldnn()) { // mkldnn transposed convolution has weight in logical order of OIHW or OIDHW, // while PyTorch has IOHW or IODHW, `._tranpose()` switches strides (no memory copy). 
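
The const_data_ptr changes in this file (and in the MIOpen files above) follow one pattern: arguments that are only read are fetched through const_data_ptr(), and any const_cast required by a third-party API that accepts only non-const pointers is confined to the call site and commented there. A schematic example of that pattern; fake_library_op is a made-up stand-in for such an API.

#include <ATen/ATen.h>

// Stand-in for a legacy C-style API that takes non-const pointers even for
// read-only inputs.
static void fake_library_op(float* in, float* out, int64_t n) {
  for (int64_t i = 0; i < n; ++i) {
    out[i] = in[i];
  }
}

void run_readonly_input(const at::Tensor& input, at::Tensor& output) {
  TORCH_CHECK(input.numel() == output.numel(), "size mismatch");
  // const_data_ptr() documents that this op never writes through `in`.
  const float* in = input.const_data_ptr<float>();
  float* out = output.data_ptr<float>();
  // The cast is localized and explained here, mirroring the batch-norm hunks.
  fake_library_op(const_cast<float*>(in), out, input.numel());
}
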
@@ -720,7 +720,7 @@ Tensor _mkldnn_convolution_transpose( } if (bias.defined()) { - const ideep::tensor b = itensor_from_tensor(bias); + const ideep::tensor b = itensor_from_tensor(bias, /*from_const_data_ptr*/true); ideep::convolution_transpose_forward::compute_v3( x, w, @@ -825,8 +825,8 @@ Tensor mkldnn_convolution_backward_input( bool is_channels_last) { auto grad_input = at::empty({0}, grad_output.options()); - auto grad_y = itensor_from_tensor(grad_output); - auto w = itensor_view_from_dense(weight); + auto grad_y = itensor_from_tensor(grad_output, /*from_const_data_ptr*/true); + auto w = itensor_view_from_dense(weight, /*from_const_data_ptr*/true); ideep::tensor grad_x; if (is_channels_last) { @@ -865,8 +865,8 @@ std::tuple mkldnn_convolution_backward_weights( int64_t groups, bool bias_defined, bool is_channels_last) { - const ideep::tensor grad_y = itensor_from_tensor(grad_output); - const ideep::tensor x = itensor_from_tensor(input); + const ideep::tensor grad_y = itensor_from_tensor(grad_output, /*from_const_data_ptr*/true); + const ideep::tensor x = itensor_from_tensor(input, /*from_const_data_ptr*/true); ideep::tensor grad_w, grad_b; if (bias_defined) { @@ -975,8 +975,8 @@ Tensor mkldnn_convolution_transpose_backward_input( bool is_channels_last) { auto grad_input = at::empty({0}, grad_output.options()); - auto grad_y = itensor_from_tensor(grad_output); - auto w = itensor_view_from_dense(weight).transpose_(0, 1); + auto grad_y = itensor_from_tensor(grad_output, /*from_const_data_ptr*/true); + auto w = itensor_view_from_dense(weight, /*from_const_data_ptr*/true).transpose_(0, 1); ideep::tensor grad_x; if (is_channels_last) { @@ -1016,8 +1016,8 @@ std::tuple mkldnn_convolution_transpose_backward_weights( int64_t groups, bool bias_defined, bool is_channels_last) { - auto grad_y = itensor_from_tensor(grad_output); - auto x = itensor_from_tensor(input); + auto grad_y = itensor_from_tensor(grad_output, /*from_const_data_ptr*/true); + auto x = itensor_from_tensor(input, /*from_const_data_ptr*/true); ideep::tensor grad_w, grad_b; if (bias_defined) { diff --git a/aten/src/ATen/native/mkldnn/IDeepRegistration.cpp b/aten/src/ATen/native/mkldnn/IDeepRegistration.cpp index 5977b045951f6..f102756ebbb93 100644 --- a/aten/src/ATen/native/mkldnn/IDeepRegistration.cpp +++ b/aten/src/ATen/native/mkldnn/IDeepRegistration.cpp @@ -30,4 +30,4 @@ void clear_computation_cache() { } // namespace at::native::mkldnn -#endif // AT_MKLDNN_ENALBED() +#endif // AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp b/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp index 054953635591d..061d154f3b40f 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp +++ b/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp @@ -81,7 +81,7 @@ ideep::tensor& itensor_from_mkldnn(const MKLDNNTensor& mkldnn_tensor) { return mklimpl->unsafe_opaque_handle()->get_target(); } -ideep::tensor itensor_view_from_dense(const Tensor& tensor) { +ideep::tensor itensor_view_from_dense(const Tensor& tensor, bool from_const_data_ptr) { TORCH_CHECK( tensor.device().is_cpu(), "itensor_view_from_dense expects CPU tensor input"); @@ -92,31 +92,41 @@ ideep::tensor itensor_view_from_dense(const Tensor& tensor) { return {{tensor.sizes().vec(), ideep::tensor::data_type::f32, tensor.strides().vec()}, - tensor.template data_ptr()}; + from_const_data_ptr ? 
+ const_cast(tensor.template const_data_ptr()) : + tensor.template data_ptr()}; } else if (tensor.scalar_type() == ScalarType::BFloat16) { return {{tensor.sizes().vec(), ideep::tensor::data_type::bf16, tensor.strides().vec()}, - tensor.template data_ptr()}; + from_const_data_ptr ? + const_cast(tensor.template const_data_ptr()) : + tensor.template data_ptr()}; } else if (tensor.scalar_type() == ScalarType::Half) { return {{tensor.sizes().vec(), ideep::tensor::data_type::f16, tensor.strides().vec()}, - tensor.template data_ptr()}; + from_const_data_ptr ? + const_cast(tensor.template const_data_ptr()) : + tensor.template data_ptr()}; } else if (tensor.scalar_type() == ScalarType::Byte) { return {{tensor.sizes().vec(), ideep::tensor::data_type::u8, tensor.strides().vec()}, - tensor.data_ptr()}; + from_const_data_ptr ? + const_cast(tensor.const_data_ptr()) : + tensor.data_ptr()}; } else if (tensor.scalar_type() == ScalarType::Char) { return {{tensor.sizes().vec(), ideep::tensor::data_type::s8, tensor.strides().vec()}, - tensor.data_ptr()}; + from_const_data_ptr ? + const_cast(tensor.const_data_ptr()) : + tensor.data_ptr()}; } else { TORCH_CHECK(false, "itensor_view_from_dense expects float/bfloat16/half/int8 tensor input"); @@ -145,11 +155,11 @@ ideep::tensor itensor_view_from_dense( // tensor is just a view of the storage of the aten dense tensor, so // caller needs to make sure the aten dense tensor's lifetime is // longer than the ideep tensor. -ideep::tensor itensor_from_tensor(const Tensor& tensor) { +ideep::tensor itensor_from_tensor(const Tensor& tensor, bool from_const_data_ptr) { if (tensor.is_mkldnn()) { return itensor_from_mkldnn(tensor); } else { - return itensor_view_from_dense(tensor); + return itensor_view_from_dense(tensor, from_const_data_ptr); } } diff --git a/aten/src/ATen/native/mkldnn/MKLDNNCommon.h b/aten/src/ATen/native/mkldnn/MKLDNNCommon.h index 86fc25c0f2fcd..5e9044ce908aa 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNCommon.h +++ b/aten/src/ATen/native/mkldnn/MKLDNNCommon.h @@ -36,7 +36,7 @@ TORCH_API ideep::tensor& itensor_from_mkldnn(const Tensor& mkldnn_tensor); // Construct an `ideep::tensor` "view" from dense tensor, note the // ideep::tensor will share the underlying buffer -TORCH_API ideep::tensor itensor_view_from_dense(const Tensor& tensor); +TORCH_API ideep::tensor itensor_view_from_dense(const Tensor& tensor, bool from_const_data_ptr=false); // Construct an `ideep::tensor` "view" from dense tensor using given desc, note // the ideep::tensor will share the underlying buffer @@ -45,7 +45,7 @@ TORCH_API ideep::tensor itensor_view_from_dense( const ideep::tensor::desc& desc); // Helper function for getting an ideep tensor out of an aten Tensor or MKL-DNN tensor. 
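// Usage sketch for the new `from_const_data_ptr` flag (illustrative only;
// `make_forward_views` is a hypothetical caller, not part of this patch).
// Forward-path code that only reads a tensor can ask for a view backed by
// const_data_ptr(); the const_cast in MKLDNNCommon.cpp above exists solely
// because the ideep/oneDNN descriptor constructor takes a non-const pointer,
// and the buffer is never written through the resulting view.
inline void make_forward_views(const at::Tensor& input, const at::Tensor& weight) {
  const ideep::tensor x = itensor_from_tensor(input, /*from_const_data_ptr*/true);
  const ideep::tensor w = itensor_from_tensor(weight, /*from_const_data_ptr*/true);
  // x and w alias the ATen storage; the caller must keep `input` and `weight`
  // alive for as long as the ideep views are in use.
  (void)x;
  (void)w;
}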
-TORCH_API ideep::tensor itensor_from_tensor(const Tensor& tensor); +TORCH_API ideep::tensor itensor_from_tensor(const Tensor& tensor, bool from_const_data_ptr=false); // Set MKLDNN verbose level TORCH_API int set_verbose(int level); diff --git a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp index 1f92705171f6d..b2901bc522be2 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp +++ b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp @@ -198,24 +198,40 @@ Tensor mkldnn_reorder_conv3d_weight( IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups) { + int64_t groups, + c10::OptionalArrayRef input_size) { mkldnn_check_low_precision(self.scalar_type(), "mkldnn_reorder_conv3d_weight"); const auto padding_expanded = expand_param_if_needed(padding, "padding", 3); const auto stride_expanded = expand_param_if_needed(stride, "stride", 3); const auto dilation_expanded = expand_param_if_needed(dilation, "dilation", 3); - auto w = itensor_from_mkldnn(self); - - auto desc = - ideep::convolution_forward::expected_weights_desc( - w.get_dims(), - w.get_data_type(), - stride_expanded, - padding_expanded, - padding_expanded, - dilation_expanded, - groups, - ideep::algorithm::convolution_direct); + ideep::dims src_dims = ideep::dims(); + bool is_channels_last = false; + auto memory_format = at::MemoryFormat::Contiguous; + if (input_size.has_value()) { + src_dims = input_size.value().vec(); + // if has input size, we always use channels last. + is_channels_last = true; + memory_format = at::MemoryFormat::ChannelsLast3d; + } + + auto self_ = self.is_mkldnn() ? self : self.contiguous(memory_format); + auto w = itensor_from_tensor(self_); + + auto desc = ideep::convolution_forward::expected_weights_desc( + w.get_dims(), + w.get_data_type(), + stride_expanded, + padding_expanded, + padding_expanded, + dilation_expanded, + groups, + ideep::algorithm::convolution_direct, + ideep::prop_kind::forward, + w.get_data_type(), + src_dims, + ideep::attr_t(), + is_channels_last); ideep::tensor result; result.init(desc); result.feed_from(w); @@ -223,6 +239,21 @@ Tensor mkldnn_reorder_conv3d_weight( return new_with_itensor_mkldnn(std::move(result), optTypeMetaToScalarType(self.options().dtype_opt()), self.options().device_opt()); } +static Tensor mkldnn_reorder_conv_weight( + const Tensor& self, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + c10::OptionalArrayRef input_size) { + TORCH_CHECK((self.dim() == 4 || self.dim() == 5), "mkldnn_reorder_conv_weight only supports conv2d and conv3d"); + if (self.dim() == 4) { + return at::native::mkldnn_reorder_conv2d_weight(self, padding, stride, dilation, groups, input_size); + } else { + return at::native::mkldnn_reorder_conv3d_weight(self, padding, stride, dilation, groups, input_size); + } +} + static Tensor mkldnn_reorder_linear_weight( const Tensor& self, c10::optional batch_size_opt) { @@ -389,9 +420,7 @@ static std::tuple get_lstm_packed_weights( get_mkldnn_dtype(weight_hh.scalar_type()), ideep::format_tag::ldgoi}); - ideep::tensor::desc packed_desc_ih, packed_desc_hh; - - std::tie(packed_desc_ih, packed_desc_hh) = + auto [packed_desc_ih, packed_desc_hh] = ideep::lstm_forward_inference::expected_weights_desc( output_sizes, src_layer, @@ -443,12 +472,11 @@ static std::vector mkldnn_reorder_mkldnn_rnn_layer_weight( batch_size = 10; } - ideep::tensor w1_, w2_; at::Tensor packed_w1, packed_w2; int64_t feature_size = weight0.size(-1); - std::tie(w1_, 
w2_) = get_lstm_packed_weights( + auto [w1_, w2_] = get_lstm_packed_weights( weight0, weight1, at::zeros( @@ -489,7 +517,7 @@ TORCH_LIBRARY_IMPL(mkldnn, CPU, m) { TORCH_FN(mkldnn_reorder_linear_weight)); m.impl( TORCH_SELECTIVE_NAME("mkldnn::_reorder_convolution_weight"), - TORCH_FN(mkldnn_reorder_conv2d_weight)); + TORCH_FN(mkldnn_reorder_conv_weight)); m.impl( TORCH_SELECTIVE_NAME("mkldnn::_reorder_mkldnn_rnn_layer_weight"), TORCH_FN(mkldnn_reorder_mkldnn_rnn_layer_weight)); @@ -520,7 +548,8 @@ Tensor mkldnn_reorder_conv3d_weight( IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups) { + int64_t groups, + c10::OptionalArrayRef input_size) { TORCH_CHECK(false, "mkldnn_reorder_conv3d_weight: MKL-DNN build is disabled"); } diff --git a/aten/src/ATen/native/mkldnn/Matmul.cpp b/aten/src/ATen/native/mkldnn/Matmul.cpp index a6770cca1d35c..db02e5f3857a6 100644 --- a/aten/src/ATen/native/mkldnn/Matmul.cpp +++ b/aten/src/ATen/native/mkldnn/Matmul.cpp @@ -53,14 +53,38 @@ bool mkldnn_fp16_gemm( c10::Half *c, int64_t ldc) { return false; } +bool mkldnn_bf32_gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + float alpha, + const float *a, int64_t lda, + const float *b, int64_t ldb, + float beta, + float *c, int64_t ldc){ + return false; + } + +bool use_mkldnn_bf32_matmul( + const Tensor& mat1, + const Tensor& mat2, + const Tensor& result) { + return false; +} -bool use_mkldnn_lower_precision_matmul( +bool use_mkldnn_matmul( const Tensor& mat1, const Tensor& mat2, const Tensor& result) { return false; } +void mkldnn_matmul_i8i8i32( + const Tensor &mat1, + const Tensor &mat2, + const Tensor &result) { + TORCH_INTERNAL_ASSERT(false, __func__, ": ATen not compiled with MKLDNN support"); +} + } // namespace native } // namespace at @@ -80,13 +104,18 @@ static bool use_mkldnn_fp16_matmul() { return at::globalContext().userEnabledMkldnn() && mkldnn_fp16_device_check(); } +static bool use_mkldnn_bf32_matmul() { + return use_mkldnn_bf16_matmul() && at::globalContext().float32MatmulPrecision() == at::Float32MatmulPrecision::MEDIUM; +} + template inline typename std::enable_if_t< + std::is_same_v || std::is_same_v || std::is_same_v, bool> -mkldnn_lowerp_gemm( +mkldnn_gemm( TransposeType transa, TransposeType transb, int64_t m, int64_t n, int64_t k, float alpha, @@ -94,8 +123,10 @@ mkldnn_lowerp_gemm( const scalar_t *b_data, int64_t ldb, float beta, scalar_t *c_data, int64_t ldc) { - if (!(std::is_same_v ? 
use_mkldnn_bf16_matmul() - : use_mkldnn_fp16_matmul()) || + bool bf16_usable = std::is_same_v && use_mkldnn_bf16_matmul(); + bool fp16_usable = std::is_same_v && use_mkldnn_fp16_matmul(); + bool bf32_usable = std::is_same_v && use_mkldnn_bf32_matmul(); + if ( !(bf16_usable || fp16_usable || bf32_usable) || (m * n * k <= 16 * 16 * 16) || (alpha == 0.0f)) { return false; } @@ -105,6 +136,7 @@ mkldnn_lowerp_gemm( if (beta != 0.0f) { op_attr = ideep::attr_t::fuse_sum(); } + if (std::is_same_v) op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16); // bf32 path // NOTE: View as c-contiguous to avoid extra reordering in mkldnn // Use identity: C = AB <=> C^T = B^T A^T @@ -117,9 +149,12 @@ mkldnn_lowerp_gemm( } auto idtype = ideep::tensor::data_type::bf16; - if constexpr (!std::is_same_v) { + if constexpr (std::is_same_v) { idtype = ideep::tensor::data_type::f16; } + if constexpr (std::is_same_v) { + idtype = ideep::tensor::data_type::f32; + } ideep::tensor a({ /*sizes=*/{k, m}, @@ -164,7 +199,7 @@ bool mkldnn_bf16_gemm( const c10::BFloat16 *b, int64_t ldb, float beta, c10::BFloat16 *c, int64_t ldc) { - return mkldnn_lowerp_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + return mkldnn_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } bool mkldnn_fp16_gemm( @@ -175,9 +210,19 @@ bool mkldnn_fp16_gemm( const c10::Half *b, int64_t ldb, float beta, c10::Half *c, int64_t ldc) { - return mkldnn_lowerp_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + return mkldnn_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } +bool mkldnn_bf32_gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + float alpha, + const float *a, int64_t lda, + const float *b, int64_t ldb, + float beta, + float *c, int64_t ldc){ + return mkldnn_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } void mkldnn_matmul( const Tensor &mat1, @@ -205,11 +250,12 @@ void mkldnn_matmul( #else TORCH_CHECK( (mat1.scalar_type() == at::kBFloat16 || - mat1.scalar_type() == at::kHalf) && + mat1.scalar_type() == at::kHalf || + mat1.scalar_type() == at::kFloat) && mat2.scalar_type() == mat1.scalar_type() && result.scalar_type() == mat1.scalar_type(), "mkldnn_matmul: only enabled for bf16 and fp16 path"); - if (mat1.scalar_type() == at::kBFloat16) { + if (mat1.scalar_type() == at::kBFloat16 || mat1.scalar_type() == at::kFloat) { TORCH_CHECK( mkldnn_bf16_device_check(), "mkldnn_matmul: mkldnn_matmul bf16 path needs the cpu support avx_ne_convert or avx512bw, avx512vl and avx512dq, or AWS Graviton3"); @@ -230,6 +276,7 @@ void mkldnn_matmul( // but mkldnn matmul primitive only support bias be 1-D tensors // to address their differences, we use mkldnn post ops to perform a fused "add" after matrix multiplication is over if (beta != 0.0f) op_attr = ideep::attr_t::fuse_sum(); + if (mat1.scalar_type() == at::kFloat) op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16); // bf32 path // If alpha = 0, dose not need actually do gemm computation if (alpha == 0) return; @@ -340,11 +387,129 @@ bool use_mkldnn_fp16_matmul( checksize(mat1, mat2)); } -bool use_mkldnn_lower_precision_matmul( +bool use_mkldnn_bf32_matmul( const Tensor& mat1, const Tensor& mat2, const Tensor& result) { - return (use_mkldnn_bf16_matmul(mat1, mat2, result) || use_mkldnn_fp16_matmul(mat1, mat2, result)); + + return ( + use_mkldnn_bf32_matmul() && + mat1.scalar_type() == kFloat && + mat2.scalar_type() == kFloat && + (!result.defined() || result.scalar_type() == kFloat) && + 
mat1.numel() != 0 && + mat2.numel() != 0 && + checksize(mat1, mat2)); +} + +bool use_mkldnn_matmul( + const Tensor& mat1, + const Tensor& mat2, + const Tensor& result) { + return (use_mkldnn_bf16_matmul(mat1, mat2, result) || use_mkldnn_fp16_matmul(mat1, mat2, result) || use_mkldnn_bf32_matmul(mat1, mat2, result)); +} + +static void _mkldnn_matmul_i8i8i32_with_primitive( + const Tensor &mat1, + const Tensor &mat2, + const Tensor &result) { + // Create ideep tensors for oneDNN computation + auto src = ideep::tensor( + {mat1.sizes().vec(), + ideep::tensor::data_type::s8, + mat1.strides().vec()}, + mat1.data_ptr()); + auto wei = ideep::tensor( + {mat2.sizes().vec(), + ideep::tensor::data_type::s8, + mat2.strides().vec()}, + mat2.data_ptr()); + auto dst = ideep::tensor( + {result.sizes().vec(), + ideep::tensor::data_type::s32, + result.strides().vec()}, + result.data_ptr()); + // Create primitive desc + auto engine = ideep::engine::cpu_engine(); + ideep::attr_t op_attr; + op_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + auto src_desc = src.get_desc(); + auto wei_desc = wei.get_desc(); + auto dst_desc = dst.get_desc(); + auto prim_desc = dnnl::matmul::primitive_desc( + engine, src_desc, wei_desc, dst_desc, op_attr); + // Reorder mat2 if needed + auto expected_weight = wei.reorder_if_differ_in(prim_desc.weights_desc()); + // Prepare args for primitive + ideep::tensor scratchpad(prim_desc.scratchpad_desc()); + ideep::exec_args args; + args.insert({DNNL_ARG_SRC, src}); + args.insert({DNNL_ARG_WEIGHTS, expected_weight}); + args.insert({DNNL_ARG_DST, dst}); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad}); + // Create primitve and execute + auto primitive = dnnl::matmul(prim_desc); + primitive.execute(ideep::stream::default_stream(), args); +} + +static void _mkldnn_gemm_i8i8i32_with_blas( + const Tensor& self, + const Tensor& mat2, + const Tensor& result) { + const int m = result.size(0); + const int n = result.size(1); + const int k = self.size(1); + + const char transa = self.strides()[1] == 1 ? 'N' : 'T'; + const char transb = mat2.strides()[1] == 1 ? 'N' : 'T'; + const char offsetc = 'F'; + + const int lda = transa == 'T' ? self.stride(1) : self.stride(0); + const int ldb = transb == 'T' ? 
mat2.stride(1) : mat2.stride(0); + const int ldc = n; + + const float alpha = 1; + const float beta = 0; + + int8_t ao = 0; + int8_t bo = 0; + int32_t co = 0; + + dnnl::gemm_s8s8s32( + transa, + transb, + offsetc, + m, + n, + k, + alpha, + (int8_t*)self.data_ptr(), + lda, + ao, + (int8_t*)mat2.data_ptr(), + ldb, + bo, + beta, + (int32_t*)result.data_ptr(), + ldc, + &co); + } + +void mkldnn_matmul_i8i8i32( + const Tensor &mat1, + const Tensor &mat2, + const Tensor &result) { + // x:s8 * w:s8 -> y:s32 + // both inputs should be 2d + // In most cases, using the DNNL blas API is faster, but it requires a/b contiguous along one dimension + bool a_is_contiguous = (mat1.stride(0) == 1 || mat1.stride(1) == 1); + bool b_is_contiguous = (mat2.stride(0) == 1 || mat2.stride(1) == 1); + + if (a_is_contiguous && b_is_contiguous) { + _mkldnn_gemm_i8i8i32_with_blas(mat1, mat2, result); + } else { + _mkldnn_matmul_i8i8i32_with_primitive(mat1, mat2, result); + } } } // namespace native diff --git a/aten/src/ATen/native/mkldnn/Matmul.h b/aten/src/ATen/native/mkldnn/Matmul.h index 86452c416953b..d82bb310efeba 100644 --- a/aten/src/ATen/native/mkldnn/Matmul.h +++ b/aten/src/ATen/native/mkldnn/Matmul.h @@ -24,6 +24,11 @@ bool use_mkldnn_fp16_matmul( const Tensor& mat2, const Tensor& result_opt); +bool use_mkldnn_bf32_matmul( + const Tensor& mat1, + const Tensor& mat2, + const Tensor& result_opt); + // Try running mkldnn optimized gemm, or returns false if naive gemm would be faster bool mkldnn_bf16_gemm( TransposeType transa, TransposeType transb, @@ -43,11 +48,31 @@ bool mkldnn_fp16_gemm( float beta, c10::Half *c, int64_t ldc); -bool use_mkldnn_lower_precision_matmul( +/* +oneDNN implicit reduced precision arithmetic feature +https://github.com/mgouicem/oneDNN/tree/mgouicem/rfcs/implicit_downconvert/rfcs/20210301-computation-datatype +to allow implicitly casting the data type from FP32 to BF16 in oneDNN compute primitives +*/ +bool mkldnn_bf32_gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + float alpha, + const float *a, int64_t lda, + const float *b, int64_t ldb, + float beta, + float *c, int64_t ldc); + +bool use_mkldnn_matmul( const Tensor& mat1, const Tensor& mat2, const Tensor& result); +// x:s8 * w:s8 -> y:s32 +TORCH_API void mkldnn_matmul_i8i8i32( + const Tensor &mat1, + const Tensor &mat2, + const Tensor &result); + } } diff --git a/aten/src/ATen/native/mkldnn/Normalization.cpp b/aten/src/ATen/native/mkldnn/Normalization.cpp index 108ce354ec9bb..0aced614a0ea3 100644 --- a/aten/src/ATen/native/mkldnn/Normalization.cpp +++ b/aten/src/ATen/native/mkldnn/Normalization.cpp @@ -6,6 +6,8 @@ #ifndef AT_PER_OPERATOR_HEADERS #include #else +#include +#include #include #include #include @@ -59,6 +61,20 @@ std::tuple _mkldnn_batch_norm_legit_no_stats( TORCH_CHECK(false, "_mkldnn_batch_norm_legit_no_stats: ATen not compiled with MKLDNN support"); } +std::tuple _batch_norm_with_update_mkldnn( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + Tensor& running_mean, Tensor& running_var, double momentum, double eps) { + TORCH_CHECK(false, "_batch_norm_with_update_mkldnn: ATen not compiled with MKLDNN support"); +} + +std::tuple _new_batch_norm_backward_mkldnn( + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + bool update, double eps, std::array grad_input_mask, const Tensor& reserve)
{ + TORCH_CHECK(false, "_new_batch_norm_backward_mkldnn: ATen not compiled with MKLDNN support"); +} + } // namespace native } // namespace at @@ -192,6 +208,17 @@ std::tuple mkldnn_batch_norm( } +std::tuple _batch_norm_with_update_mkldnn( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + Tensor& running_mean, Tensor& running_var, double momentum, double eps) { + Tensor output, save_mean, save_var; + std::tie(output, save_mean, save_var) = + mkldnn_batch_norm(input, weight_opt, bias_opt, running_mean, running_var, /*train*/true, momentum, eps); + Tensor reserve = empty_mkldnn({0}, input.scalar_type()); + return std::tuple(output, save_mean, save_var, reserve); +} + + std::tuple _mkldnn_batch_norm_legit( const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, @@ -210,6 +237,15 @@ std::tuple _mkldnn_batch_norm_legit_no_stats( } +std::tuple _new_batch_norm_backward_mkldnn( + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { + return mkldnn_batch_norm_backward(grad_output, input, weight, running_mean_opt, running_var_opt, save_mean_opt, save_var_opt, update, eps, grad_input_mask); +} + + std::tuple mkldnn_batch_norm_backward(const Tensor& grad_output, const Tensor& input, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, diff --git a/aten/src/ATen/native/mkldnn/Pooling.cpp b/aten/src/ATen/native/mkldnn/Pooling.cpp index 5eb03c43220d0..7b59d7b85fe93 100644 --- a/aten/src/ATen/native/mkldnn/Pooling.cpp +++ b/aten/src/ATen/native/mkldnn/Pooling.cpp @@ -642,7 +642,7 @@ Tensor& mkldnn_avg_pool3d_backward_out(const Tensor & grad_output, Tensor mkldnn_adaptive_avg_pool2d_backward( const Tensor& grad_output, const Tensor& input) { - TORCH_CHECK(input.dim() == 4, "mkldnn_adaptive_avg_pool2d: Input is expected a 4D tenosor"); + TORCH_CHECK(input.dim() == 4, "mkldnn_adaptive_avg_pool2d: Input is expected a 4D tensor"); auto output_size_vec = grad_output.sizes(); std::vector kernel_size(input.dim() - 2); diff --git a/aten/src/ATen/native/mkldnn/RNN.cpp b/aten/src/ATen/native/mkldnn/RNN.cpp index a5effcc0ce158..afea7f91e79ea 100644 --- a/aten/src/ATen/native/mkldnn/RNN.cpp +++ b/aten/src/ATen/native/mkldnn/RNN.cpp @@ -75,7 +75,7 @@ REGISTER_NO_CPU_DISPATCH(lstm_mkldnn_stub); } // namespace at::native -#else // AT_MKLDNN_EBABLED +#else // AT_MKLDNN_ENABLED #include #include @@ -541,8 +541,7 @@ std::pair mkldnn_impl( const Tensor& input, const hidden_type& hidden, TensorList params, bool has_biases, ideep::rnn_kind mode, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); + auto [hx, cx] = unpack_hidden(hidden); int64_t hidden_size = hx.size(2); auto mkldnn_output = mkldnn_rnn( @@ -569,4 +568,4 @@ REGISTER_ALL_CPU_DISPATCH(lstm_mkldnn_stub, &lstm_mkldnn); } // namespace at::native -#endif // AT_MKLDNN_EBABLED +#endif // AT_MKLDNN_ENABLED diff --git a/aten/src/ATen/native/mkldnn/Utils.h b/aten/src/ATen/native/mkldnn/Utils.h index 6d4a172ebe400..aa804d6bc1877 100644 --- 
a/aten/src/ATen/native/mkldnn/Utils.h +++ b/aten/src/ATen/native/mkldnn/Utils.h @@ -97,7 +97,7 @@ constexpr bool mkldnn_bf16_device_check_arm() { #if AT_MKLDNN_ENABLED() inline bool mkldnn_bf16_device_check() { -#if defined(__x86_64__) +#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) // Use ideep to check bf16 on X64 as cpuinfo has no avx_ne_convert check. return ideep::has_bf16_type_support(); #else @@ -106,7 +106,7 @@ inline bool mkldnn_bf16_device_check() { } inline bool mkldnn_fp16_device_check() { -#if defined(__x86_64__) +#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) return ideep::has_fp16_type_support(); #else return false; diff --git a/aten/src/ATen/native/mkldnn/xpu/Blas.cpp b/aten/src/ATen/native/mkldnn/xpu/Blas.cpp new file mode 100644 index 0000000000000..6cba3f4c9fa18 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/Blas.cpp @@ -0,0 +1,436 @@ +#include +#include +#include +#include + +namespace at::native::xpu { + +// result = beta * self + alpha * (mat1 * mat2) +Tensor& addmm_out( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha, + at::Tensor& result) { + checkBackend("addmm_out", {result, self, mat1, mat2}, Backend::XPU); + TORCH_CHECK( + mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); + TORCH_CHECK( + mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + TORCH_CHECK( + mat1.sizes()[1] == mat2.sizes()[0], + "mat1 and mat2 shapes cannot be multiplied (", + mat1.sizes()[0], + "x", + mat1.sizes()[1], + " and ", + mat2.sizes()[0], + "x", + mat2.sizes()[1], + ")"); + + std::vector result_shape = {mat1.size(0), mat2.size(1)}; + result.resize_(result_shape); + + IntArrayRef result_sizes = result.sizes(); + if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) { + return result; + } + + if (mat1.numel() == 0){ + if(beta.to() == 0.f){ + return result.zero_(); + } + return at::mul_out( + result, + self.expand(result.sizes()), + at::native::scalar_tensor( + beta, + self.scalar_type(), + c10::nullopt, + at::kCPU, + c10::nullopt + ) + ); + } + + TORCH_CHECK( + are_expandable(self.sizes(), result_shape), + "addmm_out input must be expanable to:", + result_shape, + " but got:", + self.sizes()); + + // complex/double case + if (mat1.is_complex() || mat1.scalar_type() == ScalarType::Double) { + AT_ERROR( + "Double and complex datatype matmul is not supported in oneDNN"); + } + + // general case + Tensor bias = Tensor(); + onednn::Attr attr; + float beta_ = beta.to(); + if (beta_ == 0.f) { + if (alpha.to() != 1.f) { + attr.append_post_eltwise( + 1.f, alpha.to(), 0.f, attr.kind_with_linear); + } + } else { + if (alpha.to() == 1.f && beta_ == 1.f) { + bias = self; + } else { + Tensor binary = self.dim() == 1 ? 
self.unsqueeze(0) : self; + // Tensor binary = self.expand_as(result); + // For post-binary-add, onednn needs binary scale=1.f + // Thus we need the following transformation + // alpha * matmul(mat1, mat2) + beta * binary + // beta * (alpha/beta * matmul(src, wei) + binary) + float alpha_ = alpha.to() / beta_; + if (alpha_ != 1.f) + attr.append_post_eltwise(1.f, alpha_, 0.f, attr.kind_with_linear); + attr.append_post_binary(attr.kind_with_binary_add, binary); + if (beta_ != 1.f) + attr.append_post_eltwise(1.f, beta_, 0.f, attr.kind_with_linear); + } + } + onednn::matmul(result, mat1, mat2, bias, true, attr); + return result; +} + +Tensor& _addmm_activation_out( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha, + bool use_gelu, + at::Tensor& result) { + addmm_out(self, mat1, mat2, beta, alpha, result); + if (use_gelu) { + at::gelu_(result); + } else { + at::relu_(result); + } + return result; +} + +Tensor& mm_out(const Tensor& self, const Tensor& mat2, Tensor& result) { + checkBackend("mm_out", {result, self, mat2}, Backend::XPU); + TORCH_CHECK(self.dim() == 2, "self must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + TORCH_CHECK( + self.sizes()[1] == mat2.sizes()[0], + "mat1 and mat2 shapes cannot be multiplied (", + self.sizes()[0], + "x", + self.sizes()[1], + " and ", + mat2.sizes()[0], + "x", + mat2.sizes()[1], + ")"); + + result.resize_({self.size(0), mat2.size(1)}); + if (self.numel() == 0 || mat2.numel() == 0) { + if (result.numel() > 0) + result.zero_(); + return result; + } + + if (self.is_complex() || self.scalar_type() == ScalarType::Double) { + AT_ERROR( + "Double and complex datatype matmul is not supported in oneDNN"); + } + + onednn::matmul(result, self, mat2, Tensor(), true, onednn::Attr()); + return result; +} + +Tensor mm(const Tensor& self, const Tensor& mat2) { + auto result = at::empty({0}, self.options()); + xpu::mm_out(self, mat2, result); + return result; +} + +Tensor mv(const Tensor& self, const Tensor& vec) { + Tensor result = at::empty({self.size(0)}, self.options()); + return at::addmv_(result, self, vec, 0, 1); +} + + +// result = beta * input + alpha * (batch1 @ batch2) +Tensor& baddbmm_out( + const Tensor& input, + const Tensor& batch1, + const Tensor& batch2, + const Scalar& beta, + const Scalar& alpha, + Tensor& result) { + checkBackend("baddbmm_out", {input, batch1, batch2}, Backend::XPU); + TORCH_CHECK(batch1.dim() == 3, "expected 3D tensor"); + TORCH_CHECK(batch2.dim() == 3, "expected 3D tensor"); + + std::vector result_shape = { + batch1.size(0), batch1.size(1), batch2.size(2)}; + result.resize_(result_shape); + if (result.numel() == 0){ + return result; + } else if (batch1.size(2) == 0){ + if (beta.to>() == 0.0){ + return result.zero_(); + }else{ + at::mul_out(result, input, beta); + return result; + } + } + + TORCH_CHECK( + are_expandable(input.sizes(), result_shape), + "baddbmm_out input must be expanable to:", + result_shape, + " but got:", + input.sizes()); + + // complex and double case + if (batch1.is_complex() || batch2.scalar_type() == ScalarType::Double) { + AT_ERROR( + "Double and complex datatype matmul is not supported in oneDNN"); + } + + // general case + onednn::Attr attr; + float beta_ = beta.to(); + Tensor binary; + if (beta_ == 0.f) { + if (alpha.to() != 1.f) { + attr.append_post_eltwise( + 1.f, alpha.to(), 0.f, attr.kind_with_linear); + } + } else { + binary = input.dim() < 3 ? input.unsqueeze(0) : input; + binary = binary.dim() < 3 ? 
binary.unsqueeze_(0) : binary; + float alpha_ = alpha.to() / beta_; + if (alpha_ != 1.f) + attr.append_post_eltwise(1.f, alpha_, 0.f, attr.kind_with_linear); + attr.append_post_binary(attr.kind_with_binary_add, binary); + if (beta_ != 1.f) + attr.append_post_eltwise(1.f, beta_, 0.f, attr.kind_with_linear); + } + onednn::matmul(result, batch1, batch2, at::Tensor(), true, attr); + return result; +} + +Tensor& baddbmm_( + Tensor& self, + const Tensor& batch1, + const Tensor& batch2, + const Scalar& beta, + const Scalar& alpha) { + TORCH_CHECK(self.dtype() == batch1.dtype(), "Input dtypes must be the same, got: input ", self.dtype(), ", batch1: ", batch1.dtype(), ", batch2: ", batch2.dtype()); + return at::native::xpu::baddbmm_out( + self, batch1, batch2, beta, alpha, self); +} + +Tensor baddbmm( + const Tensor& input, + const Tensor& batch1, + const Tensor& batch2, + const Scalar& beta, + const Scalar& alpha) { + Tensor r = at::empty({0}, input.options()); + TORCH_CHECK(input.dtype() == batch1.dtype(), "Input dtypes must be the same, got: input ", input.dtype(), ", batch1: ", batch1.dtype(), ", batch2: ", batch2.dtype()); + r = at::native::xpu::baddbmm_out(input, batch1, batch2, beta, alpha, r); + return r; +} + +Tensor& addbmm_out( + const Tensor& self, + const Tensor& batch1, + const Tensor& batch2, + const Scalar& beta, + const Scalar& alpha, + Tensor& out) { + checkBackend("addbmm_out", {out, self, batch1, batch2}, Backend::XPU); + TORCH_CHECK( + batch1.dim() == 3 && batch2.dim() == 3, + "Batch tensors should be 3D, got dimensions ", + batch1.dim(), + " and ", + batch2.dim()); + + out.resize_({batch1.size(1), batch2.size(2)}); + if (alpha.to() == 0.f || batch1.numel() == 0 || batch2.numel() == 0) { + out.resize_({batch1.size(1), batch2.size(2)}); + if (out.numel() == 0) + return out; + + if (self.defined() && beta.to() != 0.f) { + out = at::mul_out( + out, self, at::native::wrapped_scalar_tensor(at::Scalar(beta))); + } else { + out.zero_(); + } + return out; + } + + Tensor b1; + if (batch1.size(0) > 1) { + b1 = batch1.transpose(0, 1).contiguous().view({batch1.size(1), -1}); + } else { + b1 = batch1.contiguous().view({batch1.size(1), -1}); + } + auto b2 = batch2.contiguous().view({-1, batch2.size(2)}); + at::native::xpu::addmm_out(self, b1, b2, beta, alpha, out); + + return out; +} + +Tensor& addbmm_( + Tensor& self, + const Tensor& batch1, + const Tensor& batch2, + const Scalar& beta, + const Scalar& alpha) { + at::native::xpu::addbmm_out(self, batch1, batch2, beta, alpha, self); + return self; +} + +Tensor addbmm( + const Tensor& self, + const Tensor& batch1, + const Tensor& batch2, + const Scalar& beta, + const Scalar& alpha) { + Tensor out = at::empty({0}, self.options()); + at::native::xpu::addbmm_out(self, batch1, batch2, beta, alpha, out); + return out; +} + +Tensor& bmm_out(const Tensor& self, const Tensor& batch2, Tensor& result) { + checkBackend("bmm_out", {result, self, batch2}, Backend::XPU); + TORCH_CHECK(self.dim() == 3, "expected 3D tensor"); + TORCH_CHECK(batch2.dim() == 3, "expected 3D tensor"); + + result.resize_({self.size(0), self.size(1), batch2.size(2)}); + if (self.numel() == 0 || batch2.numel() == 0) { + if (result.numel() > 0) + result.zero_(); + return result; + } + + if (self.is_complex() || self.scalar_type() == ScalarType::Double) { + AT_ERROR( + "Double and complex datatype matmul is not supported in oneDNN"); + } + onednn::matmul(result, self, batch2, at::Tensor(), true, onednn::Attr()); + return result; +} + +Tensor bmm(const Tensor& self, const Tensor& 
batch2) { + auto result = at::empty({0}, self.options()); + at::native::xpu::bmm_out(self, batch2, result); + return result; +} + +Tensor& addmv_out( + const Tensor& self, + const Tensor& mat, + const Tensor& vec, + const Scalar& beta, + const Scalar& alpha, + Tensor& out) { + Tensor self_v; + TORCH_CHECK( + (mat.dim() == 2 && vec.dim() == 1 && self.dim() <= 1), + "vector + matrix @ vector expected, got ", + self.dim(), + ", ", + mat.dim(), + ", ", + vec.dim()); + if (self.dim() == 1 && self.size(0) != 1) { + TORCH_CHECK( + (mat.size(1) == vec.size(0) && mat.size(0) == self.size(0)), + "size mismatch, get ", + self.size(0), + ", ", + mat.size(0), + "x", + mat.size(1), + ",", + vec.size(0)); + self_v = self.view({self.size(0), 1}); + } else { + TORCH_CHECK( + (mat.size(1) == vec.size(0)), + "size mismatch, get ", + mat.size(0), + "x", + mat.size(1), + ",", + vec.size(0)); + self_v = self; + } + + Tensor vec_v = vec.view({vec.size(0), 1}); + at::native::xpu::addmm_out(self_v, mat, vec_v, beta, alpha, out); + out.resize_({mat.size(0)}); + return out; +} + +Tensor& tensordot_out( + const Tensor& input1, + const Tensor& input2, + IntArrayRef dims1, + IntArrayRef dims2, + Tensor& result) { + Tensor result_tmp = at::tensordot(input1, input2, dims1, dims2); + auto result_dtype = result_tmp.scalar_type(); + auto output_tensor_dtype = result.scalar_type(); + auto output_device = result.device(); + auto input1_device = input1.device(); + auto input2_device = input2.device(); + // check if the input & output tensors are on the same device. + TORCH_CHECK( + (output_device == input1_device) && (input1_device == input2_device), + "tensordot: Expected the output and input tensors to be on the " + "same device, but got the output tensor on ", + output_device, + ", input tensor a on ", + input1_device, + ", and input tensor b on ", + input2_device); + // check if the computed result has the same dtype as the out tensor + // (because tensordot does not support type promotion) + TORCH_CHECK( + result_dtype == output_tensor_dtype, + "tensordot", + ": Expected the output tensor to have dtype ", + result_dtype, + ", but got an output tensor with dtype ", + output_tensor_dtype); + at::native::resize_output(result, result_tmp.sizes()); + result.copy_(result_tmp); + return result; +} + +TORCH_LIBRARY_IMPL(aten, XPU, m){ + m.impl("addmm.out", TORCH_FN(addmm_out)); + m.impl("_addmm_activation.out", TORCH_FN(_addmm_activation_out)); + m.impl("mm.out", TORCH_FN(mm_out)); + m.impl("mm", TORCH_FN(mm)); + m.impl("baddbmm.out", TORCH_FN(baddbmm_out)); + m.impl("baddbmm_", TORCH_FN(baddbmm_)); + m.impl("baddbmm", TORCH_FN(baddbmm)); + m.impl("addbmm.out", TORCH_FN(addbmm_out)); + m.impl("addbmm_", TORCH_FN(addbmm_)); + m.impl("addbmm", TORCH_FN(addbmm)); + m.impl("bmm.out", TORCH_FN(bmm_out)); + m.impl("bmm", TORCH_FN(bmm)); + m.impl("addmv.out", TORCH_FN(addmv_out)); + m.impl("tensordot.out", TORCH_FN(tensordot_out)); +} + +} // namespace at::native::xpu diff --git a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp new file mode 100644 index 0000000000000..8ac19605b1c79 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp @@ -0,0 +1,739 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace dnnl; +using namespace at::native; +using namespace at::native::onednn; + +namespace at::native { +namespace xpu { +namespace impl { + +struct ConvParams { + std::vector stride; + std::vector padding; + 
std::vector dilation; + bool transposed; + std::vector output_padding; + int groups; + bool benchmark; + bool deterministic; + + bool is_strided() const; + bool is_dilated() const; + bool is_padded() const; + bool is_output_padding_neg() const; + bool is_output_padding_big() const; + bool is_padding_neg() const; + bool is_stride_nonpos() const; + void view1d_as_2d(); + bool use_cpu_depthwise3x3_winograd( + const at::Tensor& input, + const at::Tensor& weight) const; + bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const; +}; + +std::ostream& operator<<(std::ostream& out, const ConvParams& params) { + out << "ConvParams {" + << " stride = " << IntArrayRef{params.stride} + << " padding = " << IntArrayRef{params.padding} + << " dilation = " << IntArrayRef{params.dilation} + << " transposed = " << params.transposed + << " output_padding = " << IntArrayRef{params.output_padding} + << " groups = " << params.groups << " benchmark = " << params.benchmark + << " deterministic = " << params.deterministic << "}"; + return out; +} + +bool ConvParams::is_strided() const { + bool is_strided = false; + for (int s : stride) { + is_strided |= (s != 1); + } + return is_strided; +} + +bool ConvParams::is_dilated() const { + bool is_dilated = false; + for (int d : dilation) { + is_dilated |= (d != 1); + } + return is_dilated; +} + +bool ConvParams::is_padded() const { + bool is_padded = false; + for (int p : padding) { + is_padded |= (p != 0); + } + return is_padded; +} + +bool ConvParams::is_output_padding_neg() const { + bool is_non_neg = false; + for (int p : output_padding) { + is_non_neg |= (p < 0); + } + return is_non_neg; +} + +bool ConvParams::is_output_padding_big() const { + bool is_big = false; + for (size_t i = 0; i < output_padding.size(); i++) { + is_big |= + (output_padding[i] >= stride[i] || output_padding[i] >= dilation[i]); + } + return is_big; +} + +bool ConvParams::is_padding_neg() const { + bool is_non_neg = false; + for (int p : padding) { + is_non_neg |= (p < 0); + } + return is_non_neg; +} + +bool ConvParams::is_stride_nonpos() const { + bool is_nonpos = false; + for (int s : stride) { + is_nonpos |= (s <= 0); + } + return is_nonpos; +} + +void ConvParams::view1d_as_2d() { + if (stride.size() == 1) { + stride.insert(stride.begin(), 1); + padding.insert(padding.begin(), 0); + dilation.insert(dilation.begin(), 1); + output_padding.insert(output_padding.begin(), 0); + } +} + +bool ConvParams::use_cpu_depthwise3x3_winograd( + const at::Tensor& input, + const at::Tensor& weight) const { + return false; +} + +bool ConvParams::is_depthwise(const at::Tensor& input, const at::Tensor& weight) + const { + return !transposed && input.ndimension() == 4 && input.size(1) == groups && + groups > 1 && // no point if there is only a single group + weight.size(0) % input.size(1) == + 0; // output channels must be a multiple of input channels +} + +static void check_shape_forward( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& bias, + const ConvParams& params, + bool input_is_mkldnn) { + int64_t k = input.ndimension(); + int64_t weight_dim = weight.ndimension(); + std::vector weight_sizes(weight_dim); + if ((weight_dim == k + 1) && input_is_mkldnn) { + weight_sizes[0] = weight.size(0) * weight.size(1); + std::copy_n(weight.sizes().cbegin() + 2, k - 1, weight_sizes.begin() + 1); + weight_dim = k; + } else { + std::copy_n(weight.sizes().cbegin(), weight_dim, weight_sizes.begin()); + } + int64_t groups = params.groups; + auto padding = params.padding; + auto 
output_padding = params.output_padding; + auto stride = params.stride; + auto dilation = params.dilation; + bool transposed = params.transposed; + + TORCH_CHECK(!params.is_padding_neg(), "negative padding is not supported"); + TORCH_CHECK( + !params.is_output_padding_neg(), + "negative output_padding is not supported"); + TORCH_CHECK( + !params.is_stride_nonpos(), "non-positive stride is not supported"); + + TORCH_CHECK( + weight_dim == k, + "Expected ", + weight_dim, + "-dimensional input for ", + weight_dim, + "-dimensional weight ", + weight_sizes, + ", but got ", + k, + "-dimensional input of size ", + input.sizes(), + " instead"); + TORCH_CHECK( + weight_sizes[0] >= groups, + "Given groups=", + groups, + ", expected weight to be at least ", + groups, + " at dimension 0, but got weight of size ", + weight_sizes, + " instead"); + TORCH_CHECK( + weight_sizes[0] % groups == 0, + "Given groups=", + groups, + ", expected weight to be divisible by ", + groups, + " at dimension 0, but got weight of size ", + weight_sizes, + " instead"); + + if (!transposed) { + std::vector input_shape; + std::vector kernel_shape; + bool kernel_size_correct = true; + + TORCH_CHECK( + input.size(1) == (weight_sizes[1] * groups), + "Given groups=", + groups, + ", weight of size ", + weight_sizes, + ", expected input", + input.sizes(), + " to have ", + (weight_sizes[1] * groups), + " channels, but got ", + input.size(1), + " channels instead"); + TORCH_CHECK( + !bias.defined() || + (bias.ndimension() == 1 && bias.size(0) == weight_sizes[0]), + "Given weight of size ", + weight_sizes, + ", expected bias to be 1-dimensional with ", + weight_sizes[0], + " elements", + ", but got bias of size ", + bias.sizes(), + " instead"); + + for (int i = 2; i < k; ++i) { + input_shape.push_back(input.size(i) + 2 * padding[i - 2]); + kernel_shape.push_back(dilation[i - 2] * (weight_sizes[i] - 1) + 1); + if (input_shape.back() < kernel_shape.back()) { + kernel_size_correct = false; + } + } + + TORCH_CHECK( + input_shape.size() == kernel_shape.size(), + "Inconsistent shape between Input and Kernel"); + + if (!kernel_size_correct) { + std::ostringstream input_ss; + std::ostringstream kernel_ss; + std::ostringstream output_ss; + std::string separator = ""; + + for (int i = 0, len = input_shape.size(); i < len; ++i) { + input_ss << separator << input_shape[i]; + kernel_ss << separator << kernel_shape[i]; + separator = " x "; + } + + TORCH_CHECK( + 0, + "Calculated padded input size per channel: (", + input_ss.str(), + "). " + "Kernel size: (", + kernel_ss.str(), + "). 
Kernel size can't be greater than actual input size"); + } + } else { + TORCH_CHECK( + input.size(1) == weight_sizes[0], + "Given transposed=", + transposed, + ", weight of size ", + weight_sizes, + ", expected input", + input.sizes(), + " to have ", + weight_sizes[0], + " channels, but got ", + input.size(1), + " channels instead"); + TORCH_CHECK( + !bias.defined() || + (bias.ndimension() == 1 && + bias.size(0) == weight_sizes[1] * groups), + "Given transposed=", + transposed, + ", weight of size ", + weight_sizes, + ", expected bias to be 1-dimensional with ", + weight_sizes[1] * groups, + " elements", + ", but got bias of size ", + bias.sizes(), + " instead"); + } +} + +static at::Tensor view4d(const at::Tensor& tensor) { + TORCH_CHECK( + tensor.ndimension() == 3, + "expected 3D tensor, got tensor with ", + tensor.ndimension(), + " dimensions instead"); + return tensor.unsqueeze(2); +} + +static at::Tensor view3d(const at::Tensor& tensor) { + TORCH_CHECK( + tensor.ndimension() == 4, + "expected 4D tensor, got tensor with ", + tensor.ndimension(), + " dimensions instead"); + return tensor.squeeze(2); +} + +Attr get_onednn_conv_sum_attr( + const Tensor& input_r, + const Tensor& weight_r, + IntArrayRef stride_, + IntArrayRef padding_, + IntArrayRef dilation_, + Tensor& accumu, + double scale, + Tensor& output, + bool& is_fused, + Attr attr = Attr(), + bool force_inplace = false) { + is_fused = true; + if (scale == 0.f) + return attr; + + auto ndim = input_r.ndimension(); + auto output_size = conv_dst_size( + ndim, + input_r.sizes(), + weight_r.sizes(), + padding_, + padding_, + stride_, + dilation_); + MemoryFormat mem_fmt = at::MemoryFormat::Contiguous; + auto input_fmt = input_r.suggest_memory_format(); + auto input_is_cl = (input_fmt == at::MemoryFormat::ChannelsLast || input_fmt == at::MemoryFormat::ChannelsLast3d); + auto weight_fmt = weight_r.suggest_memory_format(); + auto weight_is_cl = (weight_fmt == at::MemoryFormat::ChannelsLast || weight_fmt == at::MemoryFormat::ChannelsLast3d); + + bool propagate_channels_last = input_is_cl || weight_is_cl; + if (propagate_channels_last) + mem_fmt = get_cl_tag_by_ndim(ndim); + + Tensor out = at::empty(output_size, input_r.options().memory_format(mem_fmt)); + if (!onednn::binary_valid(out, accumu)) { + is_fused = false; + return attr; + } + + // For post-sum and post-binary-add, onednn needs sum/binary scale=1.f + // Thus we need the following transformation + // conv(src, wei) + scale * accumu + // scale * (1/scale * conv(src, wei) + sum (or binary)) + if (scale != 1.f) + attr.append_post_eltwise( + /* scale */ 1.f, + /* alpha */ 1.f / scale, + /* beta */ 0.f, + attr.kind_with_linear); + + if (force_inplace) { + // If sizes are the same, post sum is used. + output = accumu; + attr.append_post_sum(/* sum_scale */ 1.f); + } else { + // If sizes are different, post binary is used. 
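// Worked example of the folding described above (illustrative comment, not
// part of the patch): to fuse  y = conv(x, w) + 2.0f * accumu  while keeping
// the post-sum/binary scale at 1.f, rewrite it as
//   y = 2.0f * (0.5f * conv(x, w) + accumu)
// i.e. eltwise linear with alpha = 1/scale, then post-sum or binary_add with
// accumu, then eltwise linear with alpha = scale; both linear steps are
// skipped when scale == 1.f.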
+ attr.append_post_binary(attr.kind_with_binary_add, accumu); + } + + if (scale != 1.f) + attr.append_post_eltwise( + /* scale */ 1.f, + /* alpha */ scale, + /* beta */ 0.f, + attr.kind_with_linear); + + return attr; +} + +} // namespace impl + +using namespace impl; + +Tensor _convolution_out( + Tensor& output_r, + const Tensor& input_r, + const Tensor& weight_r, + const Tensor& bias_r, + IntArrayRef stride_, + IntArrayRef padding_, + IntArrayRef dilation_, + bool transposed_, + IntArrayRef output_padding_, + int64_t groups_, + Attr attr, + IntArrayRef pad_nd = IntArrayRef({})) { + auto ndim = input_r.ndimension(); + TORCH_CHECK( + 3 == ndim || 4 == ndim || 5 == ndim, + "convolution only supports 3D, 4D, 5D tensor"); + // get computation format for Conv/TransposedConv + bool is_channels_last_suggested = use_channels_last_for_conv(input_r, weight_r, transposed_); + + Tensor input = input_r, weight = weight_r; + // PyTorch does not support ChannelsLast1D case, + // thus we need the transformation here + if (ndim == 3) { + input = view4d(input_r); + weight = view4d(weight_r); + } + // ensure the input/weight/bias/output are congituous in desired format + at::MemoryFormat mfmt = is_channels_last_suggested + ? get_cl_tag_by_ndim(input.ndimension()) + : at::MemoryFormat::Contiguous; + auto bias = bias_r.defined() ? bias_r.contiguous() : bias_r; + input = input.contiguous(mfmt); + weight = weight.contiguous(mfmt); + + auto k = weight.ndimension(); + if (k == input.ndimension() + 1) { + k = input.ndimension(); + } + int64_t dim = k - 2; + TORCH_CHECK(dim > 0, "weight should have at least three dimensions"); + + ConvParams params; + if (ndim == 3) { + // PyTorch does not support ChannelsLast1D case, + // thus we need the transformation here + params.stride = stride_.vec(); + params.padding = padding_.vec(); + params.dilation = dilation_.vec(); + params.transposed = transposed_; + params.output_padding = output_padding_.vec(); + params.groups = groups_; + params.view1d_as_2d(); + } else { + params.stride = expand_param_if_needed(stride_, "stride", dim); + // PyTorch default Conv padding should be a single integer value + // or a list of values to match the conv dimensions + // conv2d, the number of padding values should be 1 or 2 + // conv3d, the number of padding values should be 1 or 3 + // the padding value will be padded into both side of Conv input (D, H, W) + params.padding = expand_param_if_needed(padding_, "padding", dim); + params.dilation = expand_param_if_needed(dilation_, "dilation", dim); + params.transposed = transposed_; + params.output_padding = + expand_param_if_needed(output_padding_, "output_padding", dim); + params.groups = groups_; + } + check_shape_forward(input, weight, bias, params, true); + + Tensor output; + if (transposed_) { + // create output and propagate memory format + if (!output_r.defined()) { + auto dst_tz = deconv_dst_size( + input.sizes(), + weight.sizes(), + params.padding, + params.stride, + params.dilation, + params.output_padding, + params.groups); + output = at::empty(dst_tz, input.options(), mfmt); + } + + onednn::deconvolution( + output, + input, + weight, + bias, + params.stride, + params.padding, + params.output_padding, + params.dilation, + params.groups, + attr); + } else { + // oneDNN supports padding the two sides of src with different values + // the padding order should be front_top_left and back_bottom_right + auto padding_front_top_left = params.padding; + auto padding_back_bottom_right = params.padding; + + // PyTorch constant_pad_nd: + // can 
pad different value to the two sides of Conv input (W, H, D) + // (padding_left, padding_right, + // padding_top, padding_bottom, + // padding_front, padding_back) + if (pad_nd.vec().size() > 0) { + for (int i = 0; i < dim; ++i) { + padding_front_top_left[i] += pad_nd[2 * dim - 2 * i - 2]; // 4, 2, 0 + padding_back_bottom_right[i] += pad_nd[2 * dim - 2 * i - 1]; // 5, 3, 1 + } + } + + // create output and propagate memory format + if (! output_r.defined()) { + auto dst_tz = conv_dst_size( + input.ndimension(), + input.sizes(), + weight.sizes(), + padding_front_top_left, + padding_back_bottom_right, + params.stride, + params.dilation); + output = at::empty(dst_tz, input.options(), mfmt); + } + onednn::convolution( + output, + input, + weight, + bias, + padding_front_top_left, + padding_back_bottom_right, + params.stride, + params.dilation, + params.groups, + attr); + } + + if (ndim == 3) { + output = view3d(output); + } + if (output_r.defined() && !output_r.is_same(output)) { + output_r.copy_(output); + } else { + output_r = output; + } + return output_r; +} + +Tensor _convolution( + const Tensor& input_r, + const Tensor& weight_r, + const Tensor& bias_r, + IntArrayRef stride_, + IntArrayRef padding_, + IntArrayRef dilation_, + bool transposed_, + IntArrayRef output_padding_, + int64_t groups_, + Attr attr) { + Tensor output_r; + return _convolution_out( + output_r, + input_r, + weight_r, + bias_r, + stride_, + padding_, + dilation_, + transposed_, + output_padding_, + groups_, + attr); +} + +Tensor convolution_overrideable( + const Tensor& input_r, + const Tensor& weight_r, + const c10::optional& bias_r_opt, + IntArrayRef stride_, + IntArrayRef padding_, + IntArrayRef dilation_, + bool transposed_, + IntArrayRef output_padding_, + int64_t groups_) { + c10::MaybeOwned bias_r_maybe_owned = + at::borrow_from_optional_tensor(bias_r_opt); + const Tensor& bias_r = *bias_r_maybe_owned; + + auto k = weight_r.ndimension(); + at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous; + if (xpu_conv_use_channels_last(input_r, weight_r)) { + backend_memory_format = (k == 5) ? 
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; + } + Tensor input_c = input_r.contiguous(backend_memory_format); + Tensor weight_c = weight_r.contiguous(backend_memory_format); + + return _convolution( + input_c, + weight_c, + bias_r, + stride_, + padding_, + dilation_, + transposed_, + output_padding_, + groups_, + Attr()); +} + +std::tuple convolution_backward_overrideable( + const Tensor& grad_output, + const Tensor& input, + const Tensor& weight, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool transposed, + IntArrayRef output_padding, + int64_t groups, + std::array output_mask) { + auto ndim = input.ndimension(); + TORCH_CHECK( + 3 == ndim || 4 == ndim || 5 == ndim, + "convolution bwd only supports 3D, 4D, 5D tensor"); + TORCH_CHECK( + grad_output.scalar_type() == ScalarType::Float || + grad_output.scalar_type() == ScalarType::BFloat16 || + grad_output.scalar_type() == ScalarType::Double || + grad_output.scalar_type() == ScalarType::Half, + "so far only support float, bfloat16, half and double convolution backward in XPU backend, your data type is ", + grad_output.scalar_type()); + + bool is_channels_last_suggested = use_channels_last_for_conv(input, weight, transposed); + + Tensor grad_output_, input_, weight_; + IntArrayRef stride_, padding_, dilation_, output_padding_; + bool transposed_; + int64_t groups_; + ConvParams params; + if (3 == ndim) { + grad_output_ = view4d(grad_output); + input_ = view4d(input); + weight_ = view4d(weight); + params.stride = stride.vec(); + params.padding = padding.vec(); + params.dilation = dilation.vec(); + params.transposed = transposed; + params.output_padding = output_padding.vec(); + params.groups = groups; + params.view1d_as_2d(); + stride_ = params.stride; + padding_ = params.padding; + dilation_ = params.dilation; + transposed_ = params.transposed; + output_padding_ = params.output_padding; + groups_ = params.groups; + } else { + grad_output_ = grad_output; + input_ = input; + weight_ = weight; + stride_ = stride; + padding_ = padding; + dilation_ = dilation; + transposed_ = transposed; + output_padding_ = output_padding; + groups_ = groups; + } + + // ensure the tensors are contiguous + auto mfmt = is_channels_last_suggested ? 
get_cl_tag_by_ndim(input_.ndimension()) + : at::MemoryFormat::Contiguous; + grad_output_ = grad_output_.contiguous(mfmt); + weight_ = weight_.contiguous(mfmt); + input_ = input_.contiguous(mfmt); + + auto opt = grad_output_.options(); + Tensor grad_input = at::empty(input_.sizes(), opt, mfmt); + Tensor grad_weight = at::empty(weight_.sizes(), opt, mfmt); + Tensor grad_bias; + if (output_mask[2]) + grad_bias = at::empty({grad_output_.size(1)}, opt); + + if (output_mask[0]) { + if (input.numel() > 0) { + if (transposed_) { + onednn::deconvolution_backward_data( + grad_input, + grad_output_, + weight_, + stride_, + padding_, + dilation_, + groups_, + output_mask[2]); + } else { + onednn::convolution_backward_data( + grad_input, + grad_output_, + weight_, + padding_, + padding_, + stride_, + dilation_, + groups_, + output_mask[2]); + } + } + } + if (output_mask[1] || output_mask[2]) { + if (input.numel() > 0) { + if (transposed_) { + onednn::deconvolution_backward_weights( + grad_weight, + grad_bias, + grad_output_, + input_, + stride_, + padding_, + dilation_, + groups_); + } else { + onednn::convolution_backward_weights( + grad_weight, + grad_bias, + grad_output_, + input_, + weight_.sizes(), + padding_, + padding_, + stride_, + dilation_, + groups_); + } + } + } + + if (3 == ndim) { + if (output_mask[0]) + grad_input = view3d(grad_input); + grad_weight = view3d(grad_weight); + } + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +TORCH_LIBRARY_IMPL(aten, XPU, m){ + m.impl("convolution_overrideable", TORCH_FN(convolution_overrideable)); + m.impl("convolution_backward_overrideable", TORCH_FN(convolution_backward_overrideable)); +} + +} // namespace xpu +} // namespace at::native diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Attr.h b/aten/src/ATen/native/mkldnn/xpu/detail/Attr.h new file mode 100644 index 0000000000000..56e587084959d --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Attr.h @@ -0,0 +1,365 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at::native::onednn { +/* oneDNN quantization usage: + https://oneapi-src.github.io/oneDNN/dev_guide_attributes_quantization.html# + + src_fp32 = scale_src * (src_int8 - zero_point) + wei_fp32 = scale_wei * (wei_int8 - zero_point) + dst_fp32 = scale_dst * (dst_int8 - zero_point) + fp32 Convolution: dst_fp32 = src_fp32 * wei_fp32 + Int8 Convolution: dst_fp32 = (src_int8 * wei_int8) * (scale_src * scale_wei) + Int8 Convolution: dst_int8 = 1 / scale_dst * dst_fp32; + + Considering zero-point (asymmetric): + dst_fp32 = (src_int8 - src_zp) * src_sc * wei_int8 * wei_sc + dst_sc * (dst_int8 - dst_zp) = (src_int8 - src_zp) * wei_int8 * src_sc * + wei_sc + dst_int8 = (src_int8 - src_zp) * wei_int8 * src_sc * wei_sc / dst_sc + + dst_zp + + considering bias: + fp32 Convolution: dst_fp32 = src_fp32 * wei_fp32 + bias + Int8 Convolution: dst_fp32 = (src_int8 * wei_int8) * (scale_src * scale_wei) + + bias Int8 Convolution: dst_fp32 = (src_int8 * wei_int8 + bias/(scale_src * + scale_wei)) * (scale_src * scale_wei) Int8 Convolution: dst_int8 = 1 / + scale_dst * dst_fp32; +*/ + +/* + oneDNN postops usage: + Currently, oneDNN supports 5 kinds of post ops. More details can be refered +to oneDNN doc. + https://oneapi-src.github.io/oneDNN/dev_guide_attributes_post_ops.html#doxid-dev-guide-attributes-post-ops-1dev-guide-attributes-post-ops-eltwise + +0. 
without post ops + dst = Conv(src, wei) + bias; + dst_int8 = 1/q_scale * dst; q_scale is the op output quantization scale + fp32 API: Attr attr; + int8 API: Attr attr(q_scale); + +1. append eltwise post op + dst = elt_scale * Eltwise{conv_scale * [Conv(src, wei) + bias], alpha, beta} + dst_int8 = 1/q_scale * dst; + fp32 API: + Attr attr; + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_eltwise(elt_scale, alpha, beta, eltwise_algorithm) + int8 API: + Attr attr(q_scale); + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_eltwise(elt_scale, alpha, beta, eltwise_algorithm) + +2. append sum post op + dst = conv_scale * Conv(src, wei) + sum_scale * (dst - zp) + dst_int8 = 1/q_scale * dst; + fp32 API: + Attr attr; + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_sum(sum_scale) + int8 API: + Attr attr(q_scale); + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_sum(sum_scale) + +3. append binary post op + dst = Binary[Conv(src, wei)] + +*/ +using kind_t = dnnl::primitive::kind; +struct PostOpParam { + // eltwise post op constructor + PostOpParam(float scale, float alpha, float beta, dnnl::algorithm algo, kind_t kind) + : scale_(scale), alpha_(alpha), beta_(beta), algo_(algo), kind_(kind) {} + // sum post op constructor + PostOpParam(float scale, kind_t kind) : scale_(scale), kind_(kind) {} + // binary post op constructor + PostOpParam( + at::Tensor& binary, + dnnl::memory::desc& binary_md, + dnnl::memory::desc& expected_md, + dnnl::algorithm algo, + kind_t kind) + : binary_(binary), + meta_(binary_md), + expected_meta_(expected_md), + algo_(algo), + kind_(kind) {} + // prelu post op constructor + PostOpParam(int mask, kind_t kind) : mask_(mask), kind_(kind) {} + + // post sum or binary with scale post op constructor + PostOpParam(at::Tensor& binary, float scale, dnnl::algorithm algo, kind_t kind) + : scale_(scale), binary_(binary), algo_(algo), kind_(kind) {} + + // for int8 sum/eltwise + float scale_ = 1.0; + // for eltwise + float alpha_ = 0.0; + float beta_ = 0.0; + // for binary + at::Tensor binary_ = at::Tensor(); + at::Tensor expected_binary_ = at::Tensor(); + void* binary_ptr_ = nullptr; + dnnl::memory::desc meta_ = dnnl::memory::desc(); + dnnl::memory::desc expected_meta_ = dnnl::memory::desc(); + // for prelu + int mask_ = 0; + // common + dnnl::algorithm algo_ = dnnl::algorithm::eltwise_relu; + kind_t kind_ = kind_t::eltwise; +}; + +class Attr { + public: + Attr() : q_scale_(1.f), q_zero_point_(0) {} + Attr(float q_scale, int64_t zp = 0) : q_scale_(q_scale), q_zero_point_(zp) {} + + /***** eltwise *****/ + dnnl::algorithm kind_with_relu = dnnl::algorithm::eltwise_relu; + dnnl::algorithm kind_with_sigmoid = dnnl::algorithm::eltwise_logistic; + dnnl::algorithm kind_with_gelu_tanh = dnnl::algorithm::eltwise_gelu_tanh; + dnnl::algorithm kind_with_gelu_erf = dnnl::algorithm::eltwise_gelu_erf; + dnnl::algorithm kind_with_mish = dnnl::algorithm::eltwise_mish; + dnnl::algorithm kind_with_linear = dnnl::algorithm::eltwise_linear; + dnnl::algorithm kind_with_swish = dnnl::algorithm::eltwise_swish; + dnnl::algorithm kind_with_sqrt = dnnl::algorithm::eltwise_sqrt; + dnnl::algorithm kind_with_tanh = dnnl::algorithm::eltwise_tanh; + dnnl::algorithm kind_with_square = dnnl::algorithm::eltwise_square; + dnnl::algorithm kind_with_abs = dnnl::algorithm::eltwise_abs; + dnnl::algorithm kind_with_exp = dnnl::algorithm::eltwise_exp; + dnnl::algorithm kind_with_log = 
dnnl::algorithm::eltwise_log; + dnnl::algorithm kind_with_round = dnnl::algorithm::eltwise_round; + dnnl::algorithm kind_with_hardswish = dnnl::algorithm::eltwise_hardswish; + dnnl::algorithm kind_with_soft_relu = dnnl::algorithm::eltwise_soft_relu; + dnnl::algorithm kind_with_elu = dnnl::algorithm::eltwise_elu; + dnnl::algorithm kind_with_pow = dnnl::algorithm::eltwise_pow; + dnnl::algorithm kind_with_clip = dnnl::algorithm::eltwise_clip; + // note: hardsigmoid seems oneDNN still not support + dnnl::algorithm kind_with_hardsigmoid = dnnl::algorithm::eltwise_hardsigmoid; + + /***** binary *****/ + dnnl::algorithm kind_with_binary_mul = dnnl::algorithm::binary_mul; + dnnl::algorithm kind_with_binary_add = dnnl::algorithm::binary_add; + dnnl::algorithm kind_with_binary_sub = dnnl::algorithm::binary_sub; + dnnl::algorithm kind_with_binary_div = dnnl::algorithm::binary_div; + dnnl::algorithm kind_with_binary_eq = dnnl::algorithm::binary_eq; + dnnl::algorithm kind_with_binary_ne = dnnl::algorithm::binary_ne; + dnnl::algorithm kind_with_binary_ge = dnnl::algorithm::binary_ge; + dnnl::algorithm kind_with_binary_gt = dnnl::algorithm::binary_gt; + dnnl::algorithm kind_with_binary_le = dnnl::algorithm::binary_le; + dnnl::algorithm kind_with_binary_lt = dnnl::algorithm::binary_lt; + dnnl::algorithm kind_with_binary_max = dnnl::algorithm::binary_max; + dnnl::algorithm kind_with_binary_min = dnnl::algorithm::binary_min; + + // append sum post op + Attr& append_post_sum( + float sum_scale, + float sum_q_scale = 1.f, + int64_t zp = 0) { + ops_params_.push_back( + PostOpParam(/*scale_sum*/ sum_scale * sum_q_scale, kind_t::sum)); + return *this; + } + + // append eltwise post op + Attr& append_post_eltwise( + float scale, + float alpha, + float beta, + dnnl::algorithm algo) { + ops_params_.push_back( + PostOpParam(scale, alpha, beta, algo, kind_t::eltwise)); + return *this; + } + + // append binary post op + Attr& append_post_binary(dnnl::algorithm algo, const at::Tensor& binary) { + auto binary_ = binary.is_quantized() ? at::dequantize(binary) : binary; + bool binary_is_channels_last = (binary_.suggest_memory_format() == at::MemoryFormat::ChannelsLast || + binary_.suggest_memory_format() == at::MemoryFormat::ChannelsLast3d); + + binary_ = binary_is_channels_last ? 
binary_ : binary_.contiguous(); + dnnl::memory::desc md = get_onednn_md(binary_); + auto expected_md = dnnl::memory::desc( + md.get_dims(), md.get_data_type(), dnnl::memory::format_tag::any); + ops_params_.push_back( + PostOpParam(binary_, md, expected_md, algo, kind_t::binary)); + return *this; + } + + Attr& append_scale_binary( + dnnl::algorithm algo, + at::Tensor binary, + float scale, + float sum_q_scale = 1.f, + int64_t zp = 0) { + ops_params_.push_back(PostOpParam( + binary, /*scale_sum*/ scale * sum_q_scale, algo, kind_t::binary)); + return *this; + } + + // append bias with binary_add method (only used for QConv now) + template + Attr& append_bias(const at::Tensor& binary) { + // In PyTorch, bias are in shape of [OC], + // we expand its shape according to Conv dimension + // Conv1d [OC, 1, 1], Conv2d [1, OC, 1, ,1], Conv3d [1, OC, 1, 1, 1] + at::Tensor binary_ = binary.contiguous(); + dnnl::memory::desc binary_md; + switch (N) { + case 1: + binary_md = dnnl::memory::desc( + {binary.size(0), 1, 1}, + dnnl::memory::data_type::f32, + dnnl::memory::format_tag::abc); + break; + case 2: + binary_md = dnnl::memory::desc( + {1, binary.size(0), 1, 1}, + dnnl::memory::data_type::f32, + dnnl::memory::format_tag::abcd); + break; + case 3: + binary_md = dnnl::memory::desc( + {1, binary.size(0), 1, 1, 1}, + dnnl::memory::data_type::f32, + dnnl::memory::format_tag::abcde); + break; + default: + TORCH_INTERNAL_ASSERT(0, + "XPU only supports append_bias for Conv1d, Conv2d and Conv3d."); + } + // In this case, expected_md = binary_md + ops_params_.push_back(PostOpParam( + binary_, binary_md, binary_md, kind_with_binary_add, kind_t::binary)); + return *this; + } + + // append prelu post op + Attr& append_post_prelu(int mask) { + ops_params_.push_back(PostOpParam(mask, kind_t::prelu)); + return *this; + } + + dnnl::post_ops extract_post_ops(const at::Tensor& dst){ + // this function is used to extract post ops params from the ops_params_ + // and put them into onednn post ops + for (size_t i = 0; i < ops_params_.size(); ++i) { + kind_t kind = ops_params_[i].kind_; + switch (kind) { + case kind_t::eltwise: { + dnnl::algorithm algo = ops_params_[i].algo_; + float alpha = ops_params_[i].alpha_; + float beta = ops_params_[i].beta_; + dnnl_post_ops_.append_eltwise(algo, alpha, beta); + break; + } + case kind_t::sum: { + float scale = ops_params_[i].scale_; + // TODO [Asymmetric]: + // Post-sum zp for gpu is not supported currently + dnnl_post_ops_.append_sum(scale); + break; + } + case kind_t::binary: { + dnnl::algorithm algo = ops_params_[i].algo_; + auto expected_md = ops_params_[i].expected_meta_; + // In this case user may create src1 memory descriptor with + // format_tag::any or set a specific tag. However, in later case if + // tags mismatch with dst, it would result in suboptimal performance. + // So here we use format_tag::any to make sure the fast can be + // selected. + // Thus we use expected_md (with format_any) here to create pd instead + // of original md + dnnl_post_ops_.append_binary(algo, expected_md); + break; + } + default: + break; + } + } + + // if output is quantized, then append the eltwise linear to adjust the + // output scale/zero_point + if (dst.is_quantized()) { + // [Note: Gap of u8 qtensor scale between oneDNN and PyTorch] + // The /2 here is for output_scale collected by observer is different + // from quantization requirements in oneDNN. + // For Observer, the conv_scale (activation scale in other case) is + // computed through 2max_v/(qmax - qmin). 
The max_v is collected + // from the tensor to be observerd. + // (https://pytorch.org/docs/stable/generated/torch.quantization.observer.MinMaxObserver.html#torch.quantization.observer.MinMaxObserver) + // On the other hand, for u8 in oneDNN, the scale for quantization is + // defined as max_v/(qmax-qmin). Hence, we need to divide by 2 here. + // (https://oneapi-src.github.io/oneDNN/dev_guide_inference_int8.html) + dnnl_post_ops_.append_eltwise( + kind_with_linear, 1.f / q_scale_, q_zero_point_); + } + return dnnl_post_ops_; + } + + bool with_sum() { + for (size_t i = 0; i < ops_params_.size(); ++i) { + if (ops_params_[i].kind_ == kind_t::sum) { + return true; + } + } + return false; + } + + bool with_binary() { + for (size_t i = 0; i < ops_params_.size(); ++i) { + if (ops_params_[i].kind_ == kind_t::binary) { + return true; + } + } + return false; + } + + void construct_post_binary( + dnnl::primitive_desc& pd, + std::unordered_map& args) { + // This function is used to construct binary memory desc in binary post ops. + // According to oneDNN doc, the binary tensor can be in shape of + // [1, 1, 1, 1], tensor broadcast + // [1, C, 1, 1], channel broadcast + // [dst.shape], no broadcast and eltwise-wise binary operations on dst + + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + for (size_t i = 0; i < ops_params_.size(); ++i) { + kind_t kind = ops_params_[i].kind_; + if (kind == kind_t::binary) { + dnnl::memory binary_m; + auto binary = ops_params_[i].binary_; + auto md = ops_params_[i].meta_; + // qeury expected_md to achieve peak performance + auto expected_md = pd.query_md( + dnnl::query::exec_arg_md, + DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1); + + binary_m = at::native::onednn::make_onednn_memory( + md, engine, binary.data_ptr() + ); + + args.insert( + {DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1, binary_m}); + } + } + } + + float q_scale_ = 1.0; // the scale used to quantize the fused result from fp32 + // to int8, only works for int8 case + int64_t q_zero_point_ = 0; + std::vector ops_params_; // series of post ops + dnnl::post_ops dnnl_post_ops_; +}; + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Conv.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Conv.cpp new file mode 100644 index 0000000000000..87ddd0af34fe9 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Conv.cpp @@ -0,0 +1,451 @@ +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace at::native::onednn { + +constexpr int src_batch_size_dim = 0; +constexpr int weight_dst_channels_dim = 0; + +dnnl::memory::dims conv_dst_size( + int64_t ndim, + IntArrayRef src_size, + IntArrayRef weight_size, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation) { + bool has_dilation = dilation.size() > 0; + dnnl::memory::dims dst_size(ndim); + dst_size[0] = src_size[src_batch_size_dim]; + dst_size[1] = weight_size[weight_dst_channels_dim]; + for (int d = 2; d < ndim; ++d) { + auto dilate = has_dilation ? 
dilation[d - 2] : 1; + auto kernel = dilate * (weight_size[d] - 1) + 1; + dst_size[d] = + (src_size[d] + + (padding_front_top_left[d - 2] + padding_back_bottom_right[d - 2]) - + kernel) / + stride[d - 2] + + 1; + } + return dst_size; +} + +static inline dnnl::memory::dims compatible_dilation(IntArrayRef& dilation) { + dnnl::memory::dims ret = dilation.vec(); + for (auto it = ret.begin(); it != ret.end(); it++) { + *it -= 1; + } + return ret; +} + +static inline dnnl::memory::format_tag conv_src_fmt( + const int64_t ndim, + const bool is_channels_last = false) { + if (!is_channels_last) { + return (ndim == 3) + ? dnnl::memory::format_tag::ncw + : ((ndim == 4) ? dnnl::memory::format_tag::nchw + : ((ndim == 5) ? dnnl::memory::format_tag::ncdhw + : dnnl::memory::format_tag::undef)); + } else { + return (ndim == 3) + ? dnnl::memory::format_tag::nwc + : ((ndim == 4) ? dnnl::memory::format_tag::nhwc + : ((ndim == 5) ? dnnl::memory::format_tag::ndhwc + : dnnl::memory::format_tag::undef)); + } +} + +static inline dnnl::memory::format_tag conv_weight_fmt( + const int64_t ndim, + const bool grouped = false, + const bool is_channels_last = false) { + if (!is_channels_last) { + return (ndim == 3) + ? (grouped ? dnnl::memory::format_tag::goiw : dnnl::memory::format_tag::oiw) + : (ndim == 4) + ? (grouped ? dnnl::memory::format_tag::goihw : dnnl::memory::format_tag::oihw) + : ((ndim == 5) ? (grouped ? dnnl::memory::format_tag::goidhw + : dnnl::memory::format_tag::oidhw) + : dnnl::memory::format_tag::undef); + } else { + return (ndim == 3) + ? (grouped ? dnnl::memory::format_tag::gowi : dnnl::memory::format_tag::owi) + : (ndim == 4) + ? (grouped ? dnnl::memory::format_tag::gohwi : dnnl::memory::format_tag::ohwi) + : ((ndim == 5) ? (grouped ? dnnl::memory::format_tag::godhwi + : dnnl::memory::format_tag::odhwi) + : dnnl::memory::format_tag::undef); + } +} + +static inline dnnl::memory::dims compatible_weight_dims( + const int64_t ndim, + const int64_t groups, + const int64_t oc, + const int64_t ic, + const IntArrayRef wsizes) { + if (ndim == 3) { + auto kw = wsizes[2]; + return (groups != 1) ? dnnl::memory::dims({groups, oc / groups, ic / groups, kw}) + : dnnl::memory::dims({oc, ic, kw}); + } else if (ndim == 4) { + auto kh = wsizes[2]; + auto kw = wsizes[3]; + return (groups != 1) + ? dnnl::memory::dims({groups, oc / groups, ic / groups, kh, kw}) + : dnnl::memory::dims({oc, ic, kh, kw}); + } else if (ndim == 5) { + auto kd = wsizes[2]; + auto kh = wsizes[3]; + auto kw = wsizes[4]; + return (groups != 1) + ? 
dnnl::memory::dims({groups, oc / groups, ic / groups, kd, kh, kw}) + : dnnl::memory::dims({oc, ic, kd, kh, kw}); + } + + return {}; +} + +static std::tuple< + dnnl::memory::desc, + dnnl::memory::desc, + dnnl::memory::desc> + conv_get_md( + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& dst, + int64_t groups, + bool is_channels_last) { + // create memory desc from the src/weight/dst tensors + dnnl::memory::desc src_usr_md, weight_usr_md, dst_usr_md; + auto ndim = src.ndimension(); + auto fmt_src = + conv_src_fmt(ndim, is_channels_last); + + auto src_size = src.sizes().vec(); + auto src_data_t = get_onednn_dtype_include_double(src); + src_usr_md = dnnl::memory::desc(src_size, src_data_t, fmt_src); + + auto dst_size = dst.sizes().vec(); + auto dst_data_t = get_onednn_dtype_include_double(dst); + dst_usr_md = dnnl::memory::desc(dst_size, dst_data_t, fmt_src); + + auto ic = src.size(1); + auto oc = dst.size(1); + auto wei_data_t = get_onednn_dtype_include_double(weight); + dnnl::memory::dims weight_size = + compatible_weight_dims(ndim, groups, oc, ic, weight.sizes()); + auto fmt_weight = conv_weight_fmt( + ndim, + groups != 1, + is_channels_last); + weight_usr_md = dnnl::memory::desc(weight_size, wei_data_t, fmt_weight); + + return {src_usr_md, weight_usr_md, dst_usr_md}; +} + +sycl::event convolution( + at::Tensor& dst, + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& bia, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + Attr& attr, + const std::vector& deps) { + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + auto stream = GpuStreamManager::Instance().get_stream(); + + bool is_channels_last = use_channels_last_for_conv(src, weight, false); + + // create usr_md for tensors, and md for conv primitive + dnnl::memory::desc src_md, weight_md, dst_md; + std::tie(src_md, weight_md, dst_md) = conv_get_md(src, weight, dst, groups, is_channels_last); + + auto bia_fmt = dnnl::memory::format_tag::x; + auto bia_md = bia.defined() + ? 
dnnl::memory::desc( + {dst.size(1)}, get_onednn_dtype_include_double(bia), bia_fmt) + : dnnl::memory::desc(); + + // create conv primitive descriptor + dnnl::memory::dims _stride = stride.vec(); + dnnl::memory::dims _padding_front_top_left = padding_front_top_left.vec(); + dnnl::memory::dims _padding_back_bottom_right = padding_back_bottom_right.vec(); + dnnl::memory::dims _dilation = compatible_dilation(dilation); + + // extract post ops + dnnl::primitive_attr pattr; + dnnl::post_ops po = attr.extract_post_ops(dst); + pattr.set_post_ops(po); + + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + + auto conv_fwd_pd = dnnl::convolution_forward::primitive_desc( + engine, + dnnl::prop_kind::forward, + dnnl::algorithm::convolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding_front_top_left, + _padding_back_bottom_right, + pattr); + + dnnl::memory src_m, weight_m, dst_m, bia_m; + at::Tensor src_blocked, weight_blocked, dst_blocked = dst; + + src_m = make_onednn_memory(src_md, engine, src.data_ptr()); + weight_m = make_onednn_memory(weight_md, engine, weight.data_ptr()); + dst_m = make_onednn_memory(dst_md, engine, dst.data_ptr()); + + + std::unordered_map args; + if (bia.defined()) { + bia_m = make_onednn_memory(bia_md, engine, bia.data_ptr()); + args.insert({DNNL_ARG_BIAS, bia_m}); + } + auto expected_dst_md = conv_fwd_pd.dst_desc(); + if (attr.with_binary()) + attr.construct_post_binary(conv_fwd_pd, args); + + args.insert({DNNL_ARG_SRC, src_m}); + args.insert({DNNL_ARG_WEIGHTS, weight_m}); + args.insert({DNNL_ARG_DST, dst_m}); + + size_t scratchpad_size = conv_fwd_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, src.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_m = make_onednn_memory( + conv_fwd_pd.scratchpad_desc(), engine, scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_m}); + + auto conv_forward = dnnl::convolution_forward(conv_fwd_pd); + auto conv_fwd_event = dnnl::sycl_interop::execute(conv_forward, stream, args, deps); + + return conv_fwd_event; +} + +sycl::event convolution_backward_weights( + at::Tensor& diff_weight, + at::Tensor& diff_bia, + const at::Tensor& diff_dst, + const at::Tensor& src, + IntArrayRef diff_weight_aten_size, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + const std::vector& deps) { + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + auto stream = GpuStreamManager::Instance().get_stream(); + + bool is_channels_last = use_channels_last_for_conv(src, diff_dst, /*is_transposed=*/false); + + // create dnnl::memory desc + dnnl::memory::desc src_md, weight_md, dst_md; + std::tie(src_md, weight_md, dst_md) = + conv_get_md(src, diff_weight, diff_dst, groups, is_channels_last); + dnnl::memory::format_tag bia_fmt = dnnl::memory::format_tag::x; + auto bia_md = diff_bia.defined() + ? 
dnnl::memory::desc({diff_dst.size(1)}, src_md.get_data_type(), bia_fmt) + : dnnl::memory::desc(); + + // create fwd primitive hint + dnnl::memory::dims _stride = stride.vec(); + dnnl::memory::dims _padding_front_top_left = padding_front_top_left.vec(); + dnnl::memory::dims _padding_back_bottom_right = padding_back_bottom_right.vec(); + dnnl::memory::dims _dilation = compatible_dilation(dilation); + dnnl::primitive_attr pattr; + + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + auto conv_fwd_pd = dnnl::convolution_forward::primitive_desc( + engine, + dnnl::prop_kind::forward, + dnnl::algorithm::convolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding_front_top_left, + _padding_back_bottom_right, + pattr); + + // create bwd weight primitive + auto conv_bwd_w_pd = dnnl::convolution_backward_weights::primitive_desc( + engine, + dnnl::algorithm::convolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding_front_top_left, + _padding_back_bottom_right, + conv_fwd_pd, + pattr); + + // create bwd memory + at::Tensor expected_src, expected_diff_dst, expected_diff_weight; + dnnl::memory src_m, diff_dst_m, diff_weight_m; + + src_m = make_onednn_memory(src_md, engine, src.data_ptr()); + diff_dst_m = make_onednn_memory(dst_md, engine, diff_dst.data_ptr()); + diff_weight_m = make_onednn_memory(weight_md, engine, diff_weight.data_ptr()); + + // insert args + std::unordered_map args; + args.insert({DNNL_ARG_DIFF_DST, diff_dst_m}); + args.insert({DNNL_ARG_SRC, src_m}); + args.insert({DNNL_ARG_DIFF_WEIGHTS, diff_weight_m}); + if (diff_bia.defined()) { + dnnl::memory diff_bia_m = + make_onednn_memory(bia_md, engine, diff_bia.data_ptr()); + args.insert({DNNL_ARG_DIFF_BIAS, diff_bia_m}); + } + + size_t scratchpad_size = conv_bwd_w_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, src.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_m = make_onednn_memory( + conv_bwd_w_pd.scratchpad_desc(), engine, scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_m}); + + // execute primitive + auto conv_bwd_w = dnnl::convolution_backward_weights(conv_bwd_w_pd); + sycl::event conv_bwd_w_event = dnnl::sycl_interop::execute(conv_bwd_w, stream, args, deps); + + return conv_bwd_w_event; +} + +sycl::event convolution_backward_data( + at::Tensor& diff_src, + const at::Tensor& diff_dst, + const at::Tensor& weight, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool bias_defined, + const std::vector& deps) { + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + auto stream = GpuStreamManager::Instance().get_stream(); + + bool is_channels_last = use_channels_last_for_conv(diff_dst, weight, /*is_transposed=*/false); + + // create memory desc + dnnl::memory::desc src_md, weight_md, dst_md; + std::tie(src_md, weight_md, dst_md) = + conv_get_md(diff_src, weight, diff_dst, groups, is_channels_last); + dnnl::memory::format_tag bia_fmt = dnnl::memory::format_tag::x; + auto bia_md = bias_defined + ? 
dnnl::memory::desc({diff_dst.size(1)}, weight_md.get_data_type(), bia_fmt) + : dnnl::memory::desc(); + + // create fwd primitive desc hint + dnnl::primitive_attr pattr; + + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + dnnl::memory::dims _stride = stride.vec(); + dnnl::memory::dims _padding_front_top_left = padding_front_top_left.vec(); + dnnl::memory::dims _padding_back_bottom_right = padding_back_bottom_right.vec(); + dnnl::memory::dims _dilation = compatible_dilation(dilation); + auto conv_forward_pd = dnnl::convolution_forward::primitive_desc( + engine, + dnnl::prop_kind::forward, + dnnl::algorithm::convolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding_front_top_left, + _padding_back_bottom_right, + pattr); + + auto conv_backward_data_pd = dnnl::convolution_backward_data::primitive_desc( + engine, + dnnl::algorithm::convolution_direct, + src_md, + weight_md, + dst_md, + _stride, + _dilation, + _padding_front_top_left, + _padding_back_bottom_right, + conv_forward_pd, + pattr); + + // create memory + at::Tensor expected_src, expected_wei, expected_dst; + dnnl::memory diff_dst_m, wei_m, diff_src_m; + + diff_src_m = make_onednn_memory(src_md, engine, diff_src.data_ptr()); + wei_m = make_onednn_memory(weight_md, engine, weight.data_ptr()); + diff_dst_m = make_onednn_memory(dst_md, engine, diff_dst.data_ptr()); + + + // insert args + std::unordered_map args; + size_t scratchpad_size = conv_backward_data_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, diff_dst.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_memory = make_onednn_memory( + conv_backward_data_pd.scratchpad_desc(), + engine, + scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_memory}); + args.insert({DNNL_ARG_DIFF_DST, diff_dst_m}); + args.insert({DNNL_ARG_WEIGHTS, wei_m}); + args.insert({DNNL_ARG_DIFF_SRC, diff_src_m}); + + // execute primitive + auto conv_backward_data = + dnnl::convolution_backward_data(conv_backward_data_pd); + auto conv_backward_data_event = dnnl::sycl_interop::execute(conv_backward_data, stream, args, deps); + return conv_backward_data_event; + +} + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Deconv.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Deconv.cpp new file mode 100644 index 0000000000000..b8465c62c7e20 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Deconv.cpp @@ -0,0 +1,435 @@ +#include +#include + +#include +#include +#include +#include + +namespace at::native::onednn { + +static inline dnnl::memory::dims deconv_compatible_dilation(IntArrayRef& dilation) { + dnnl::memory::dims ret = dilation.vec(); + for (auto it = ret.begin(); it != ret.end(); it++) { + *it -= 1; + } + return ret; +} + +static inline std::vector compatible_groups_deconv_strides( + const at::Tensor& weight, + dnnl::memory::dims group_size) { + std::vector strides = weight.strides().vec(); + strides[0] = weight.strides()[1]; + strides[1] = weight.strides()[0]; + strides.insert(strides.begin(), group_size[2] * weight.strides()[0]); + return strides; +} + +dnnl::memory::dims deconv_dst_size( + IntArrayRef src_size, + IntArrayRef weight_size, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + IntArrayRef dst_padding, + int64_t groups) { + auto dim = src_size.size(); + 
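  // The loop below computes the transposed-convolution output size with the
  // same formula PyTorch documents for ConvTranspose{1,2,3}d:
  //   dst[d] = (src[d] - 1) * stride - 2 * padding
  //            + dilation * (kernel - 1) + 1 + output_padding
  // e.g. src=5, stride=2, padding=1, kernel=3, dilation=1, output_padding=0
  //      gives (5 - 1) * 2 - 2 * 1 + (1 * (3 - 1) + 1) + 0 = 9.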
dnnl::memory::dims dst_size(dim); + auto kernel_size = weight_size.slice(2); + + dst_size[0] = src_size[0]; + dst_size[1] = weight_size[1] * groups; + for (size_t d = 2; d < dim; ++d) { + dst_size[d] = (src_size[d] - 1) * stride[d - 2] - 2 * padding[d - 2] + + (dilation[d - 2] * (kernel_size[d - 2] - 1) + 1) + dst_padding[d - 2]; + } + return dst_size; +} + +static inline dnnl::memory::format_tag deconv_src_fmt( + const int64_t ndim, + const bool is_channels_last = false) { + // 3D: n/c/w (n/w/c) [a/b/c (a/c/b)] + // 4D: n/c/h/w (n/h/w/c) [a/b/c/d (a/c/d/b)] + // 5D: n/c/d/h/w (n/d/h/w/c) [a/b/c/d/e (a/c/d/e/b)] + if (!is_channels_last) { + return (ndim == 3) + ? dnnl::memory::format_tag::ncw + : ((ndim == 4) ? dnnl::memory::format_tag::nchw + : ((ndim == 5) ? dnnl::memory::format_tag::ncdhw + : dnnl::memory::format_tag::undef)); + } else { + return (ndim == 3) + ? dnnl::memory::format_tag::nwc + : ((ndim == 4) ? dnnl::memory::format_tag::nhwc + : ((ndim == 5) ? dnnl::memory::format_tag::ndhwc + : dnnl::memory::format_tag::undef)); + } +} + +static inline std::vector deconv_weight_fmt( + const at::Tensor& weight, + const int64_t ndim, + dnnl::memory::dims weight_size, + const bool grouped = false, + const bool is_channels_last = false) { + // 3D fmt: (g)i/o/w ((g)i/w/o) [b/a/c (b/c/a)] + // 4D fmt: (g)i/o/h/w ((g)i/h/w/o) [b/a/c/d (b/c/d/a)] + // 5D fmt: (g)i/o/d/h/w ((g)i/d/h/w/o) [b/a/c/d/e (b/c/d/e/a)] + auto strides_ = weight.strides().vec(); + std::vector strides; + if (grouped) { + strides = compatible_groups_deconv_strides(weight, weight_size); + } else { + strides = strides_; + std::swap(strides[0], strides[1]); + } + return strides; +} + +static inline dnnl::memory::dims deconv_compatible_weight_dims( + int64_t ndim, + int64_t groups, + int64_t oc, + int64_t ic, + IntArrayRef weight_size) { + if (ndim == 3) { + auto kw = weight_size[2]; + return (groups != 1) ? dnnl::memory::dims({groups, oc / groups, ic / groups, kw}) + : dnnl::memory::dims({oc, ic, kw}); + } else if (ndim == 4) { + auto kh = weight_size[2]; + auto kw = weight_size[3]; + return (groups != 1) + ? dnnl::memory::dims({groups, oc / groups, ic / groups, kh, kw}) + : dnnl::memory::dims({oc, ic, kh, kw}); + } else if (ndim == 5) { + auto kd = weight_size[2]; + auto kh = weight_size[3]; + auto kw = weight_size[4]; + return (groups != 1) + ? 
dnnl::memory::dims({groups, oc / groups, ic / groups, kd, kh, kw}) + : dnnl::memory::dims({oc, ic, kd, kh, kw}); + } else { + TORCH_CHECK(0, "unsupported dimension in xpu oneDNN deconvolution..."); + } +} + +static std::tuple< + dnnl::memory::desc, + dnnl::memory::desc, + dnnl::memory::desc> +deconv_get_plain_md( + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& dst, + int64_t groups, + bool is_channels_last_suggested) { + auto ndim = src.ndimension(); + auto src_data_t = get_onednn_dtype_include_double(src); + auto fmt_src = deconv_src_fmt(ndim, is_channels_last_suggested); + auto src_usr_md = dnnl::memory::desc(src.sizes().vec(), src_data_t, fmt_src); + + auto dst_data_t = get_onednn_dtype_include_double(dst); + auto dst_usr_md = dnnl::memory::desc(dst.sizes().vec(), dst_data_t, fmt_src); + + auto ic = src.size(1); + auto oc = dst.size(1); + dnnl::memory::dims weight_size = + deconv_compatible_weight_dims(ndim, groups, oc, ic, weight.sizes()); + auto weight_dt = get_onednn_dtype_include_double(weight); + auto fmt_weight = deconv_weight_fmt( + weight, ndim, weight_size, groups != 1, is_channels_last_suggested); + dnnl::memory::desc weight_usr_md = dnnl::memory::desc(weight_size, weight_dt, fmt_weight); + + return {src_usr_md, weight_usr_md, dst_usr_md}; +} + +sycl::event deconvolution( + at::Tensor& dst, + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& bia, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dst_padding, + IntArrayRef dilation, + int64_t groups, + Attr& attr, + const std::vector& deps) { + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + auto stream = GpuStreamManager::Instance().get_stream(); + + bool is_channels_last_suggested = use_channels_last_for_conv(src, weight, /*is_transposed=*/true); + + // create usr_md for tensors, and md for conv primitive + dnnl::memory::desc src_md, weight_md, dst_md; + + std::tie(src_md, weight_md, dst_md) = + deconv_get_plain_md(src, weight, dst, groups, is_channels_last_suggested); + + dnnl::memory::format_tag bia_fmt = dnnl::memory::format_tag::x; + auto bia_md = bia.defined() + ? 
dnnl::memory::desc( + {dst.size(1)}, get_onednn_dtype_include_double(bia), bia_fmt) + : dnnl::memory::desc(); + + // create primitive desc + dnnl::memory::dims _stride = stride.vec(); + dnnl::memory::dims _padding = padding.vec(); + dnnl::memory::dims _dilation = deconv_compatible_dilation(dilation); + + // construct primitive attr + dnnl::primitive_attr pattr; + dnnl::post_ops po = attr.extract_post_ops(dst); + pattr.set_post_ops(po); + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + auto deconv_fwd_pd = dnnl::deconvolution_forward::primitive_desc( + engine, + dnnl::prop_kind::forward, + dnnl::algorithm::deconvolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding, + _padding, + pattr); + + dnnl::memory src_m, weight_m, dst_m, bia_m; + at::Tensor src_blocked, weight_blocked, dst_blocked = dst; + + src_m = make_onednn_memory(src_md, engine, src.data_ptr()); + weight_m = make_onednn_memory(weight_md, engine, weight.data_ptr()); + dst_m = make_onednn_memory(dst_md, engine, dst.data_ptr()); + + std::unordered_map args; + args.insert({DNNL_ARG_SRC, src_m}); + args.insert({DNNL_ARG_WEIGHTS, weight_m}); + args.insert({DNNL_ARG_DST, dst_m}); + + if (bia.defined()) { + auto bia_m = make_onednn_memory(bia_md, engine, bia.data_ptr()); + args.insert({DNNL_ARG_BIAS, bia_m}); + } + if (attr.with_binary()) + attr.construct_post_binary(deconv_fwd_pd, args); + + size_t scratchpad_size = deconv_fwd_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, src.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_m = make_onednn_memory( + deconv_fwd_pd.scratchpad_desc(), engine, scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_m}); + + auto deconv_fwd = dnnl::deconvolution_forward(deconv_fwd_pd); + sycl::event deconv_event = dnnl::sycl_interop::execute(deconv_fwd, stream, args, deps); + return deconv_event; + +} + +sycl::event deconvolution_backward_data( + at::Tensor& diff_src, + const at::Tensor& diff_dst, + const at::Tensor& weight, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + bool bias_defined, + const std::vector& deps) { + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + auto stream = GpuStreamManager::Instance().get_stream(); + + bool is_channels_last_suggested = + use_channels_last_for_conv(diff_dst, weight, /*is_transposed=*/true); + // create memory desc + dnnl::memory::desc src_md, weight_md, dst_md; + std::tie(src_md, weight_md, dst_md) = + deconv_get_plain_md( + diff_src, weight, diff_dst, groups, is_channels_last_suggested); + + dnnl::memory::format_tag bia_fmt = dnnl::memory::format_tag::x; + auto bias_md = bias_defined + ? 
dnnl::memory::desc({diff_dst.size(1)}, weight_md.get_data_type(), bia_fmt) + : dnnl::memory::desc(); + + // create fwd primitive desc hint + dnnl::primitive_attr pattr; + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + + dnnl::memory::dims _stride = stride.vec(); + dnnl::memory::dims _padding = padding.vec(); + dnnl::memory::dims _dilation = deconv_compatible_dilation(dilation); + auto deconv_fwd_pd = dnnl::deconvolution_forward::primitive_desc( + engine, + dnnl::prop_kind::forward, + dnnl::algorithm::deconvolution_direct, + src_md, + weight_md, + bias_md, + dst_md, + _stride, + _dilation, + _padding, + _padding, + pattr); + + // create bwd primitive desc + auto deconv_backward_data_pd = dnnl::deconvolution_backward_data::primitive_desc( + engine, + dnnl::algorithm::deconvolution_direct, + src_md, + weight_md, + dst_md, + _stride, + _dilation, + _padding, + _padding, + deconv_fwd_pd); + + // create memory + dnnl::memory diff_dst_m, wei_m, diff_src_m; + + diff_src_m = make_onednn_memory(src_md, engine, diff_src.data_ptr()); + wei_m = make_onednn_memory(weight_md, engine, weight.data_ptr()); + diff_dst_m = make_onednn_memory(dst_md, engine, diff_dst.data_ptr()); + + // insert args + std::unordered_map args; + size_t scratchpad_size = deconv_backward_data_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, diff_dst.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_memory = make_onednn_memory( + deconv_backward_data_pd.scratchpad_desc(), + engine, + scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_memory}); + args.insert({DNNL_ARG_DIFF_DST, diff_dst_m}); + args.insert({DNNL_ARG_WEIGHTS, wei_m}); + args.insert({DNNL_ARG_DIFF_SRC, diff_src_m}); + + // execute primitive + auto deconv_backward_data = + dnnl::deconvolution_backward_data(deconv_backward_data_pd); + sycl::event deconv_bwd_data_event = dnnl::sycl_interop::execute(deconv_backward_data, stream, args, deps); + return deconv_bwd_data_event; + +} + +sycl::event deconvolution_backward_weights( + at::Tensor& diff_weight, + at::Tensor& diff_bia, + const at::Tensor& diff_dst, + const at::Tensor& src, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + const std::vector& deps) { + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + auto stream = GpuStreamManager::Instance().get_stream(); + + bool is_channels_last_suggested = + use_channels_last_for_conv(src, diff_dst, /*is_transposed=*/true); + + // create memory desc + dnnl::memory::desc src_md, weight_md, dst_md; + std::tie(src_md, weight_md, dst_md) = deconv_get_plain_md( + src, diff_weight, diff_dst, groups, is_channels_last_suggested); + + dnnl::memory::format_tag bia_fmt = dnnl::memory::format_tag::x; + auto bia_md = diff_bia.defined() + ? 
dnnl::memory::desc({diff_dst.size(1)}, src_md.get_data_type(), bia_fmt) + : dnnl::memory::desc(); + + // create fwd primitive desc hint + dnnl::memory::dims _stride = stride.vec(); + dnnl::memory::dims _padding = padding.vec(); + dnnl::memory::dims _dilation = deconv_compatible_dilation(dilation); + dnnl::primitive_attr pattr; + + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + auto deconv_fwd_pd = dnnl::deconvolution_forward::primitive_desc( + engine, + dnnl::prop_kind::forward, + dnnl::algorithm::deconvolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding, + _padding, + pattr); + + auto deconv_bwd_w_pd = dnnl::deconvolution_backward_weights::primitive_desc( + engine, + dnnl::algorithm::deconvolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding, + _padding, + deconv_fwd_pd, + pattr); + + // create bwd dnnl::memory + dnnl::memory src_m, diff_dst_m, diff_weight_m; + + src_m = make_onednn_memory(src_md, engine, src.data_ptr()); + diff_dst_m = make_onednn_memory(dst_md, engine, diff_dst.data_ptr()); + diff_weight_m = make_onednn_memory(weight_md, engine, diff_weight.data_ptr()); + + // insert args + std::unordered_map args; + args.insert({DNNL_ARG_DIFF_DST, diff_dst_m}); + args.insert({DNNL_ARG_SRC, src_m}); + args.insert({DNNL_ARG_DIFF_WEIGHTS, diff_weight_m}); + + if (diff_bia.defined()) { + dnnl::memory diff_bia_m = + make_onednn_memory(bia_md, engine, diff_bia.data_ptr()); + args.insert({DNNL_ARG_DIFF_BIAS, diff_bia_m}); + } + + size_t scratchpad_size = deconv_bwd_w_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, src.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_m = make_onednn_memory( + deconv_bwd_w_pd.scratchpad_desc(), engine, scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_m}); + + // execute primitive + auto deconv_bwd_w = dnnl::deconvolution_backward_weights(deconv_bwd_w_pd); + + sycl::event deconv_bwd_w_event = dnnl::sycl_interop::execute(deconv_bwd_w, stream, args, deps); + return deconv_bwd_w_event; + +} + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Matmul.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Matmul.cpp new file mode 100644 index 0000000000000..7dfd31b93ba8d --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Matmul.cpp @@ -0,0 +1,244 @@ + +#include + +#include +#include + +#include +#include + +#include + +namespace at::native::onednn { + +sycl::event matmul( + at::Tensor& result, + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& b_raw, + bool m2_trans, + Attr attr, + const std::vector& deps) { + int64_t dims = result.dim(); + TORCH_CHECK( + dims == 2 || dims == 3, + "oneDNN matmul only works with 2D or 3D, got ", + dims); + TORCH_CHECK( + dims == mat1.dim() && dims == mat2.dim(), + "oneDNN input matrixes must have the same ranks"); + TORCH_CHECK(result.defined(), "oneDNN matmul result should be defined"); + + at::Device cur_device = at::Device(at::kXPU, c10::xpu::current_device()); + auto engine = GpuEngineManager::Instance().get_engine(cur_device); + auto stream = GpuStreamManager::Instance().get_stream(); + + at::Tensor m1 = is_onednn_matmul_strides(mat1) ? mat1 : mat1.contiguous(); + at::Tensor m2 = is_onednn_matmul_strides(mat2) ? 
mat2 : mat2.contiguous(); + at::Tensor dst = is_onednn_matmul_strides(result, true) ? result : result.contiguous(); + + int64_t m = dst.size(-2); + int64_t n = dst.size(-1); + int64_t k = m1.size(-1); + int64_t mb = 1; + + if (dims == 3) { + mb = dst.size(0); + TORCH_CHECK( + mb == m1.size(0) && mb == m2.size(0), + "batch size mismatch, dst mb: ", + mb, + "m1 mb", + m1.size(0), + " m2 mb: ", + m2.size(0)); + } + + // validate bias and make it compatible with oneDNN implementation + bool with_bias = false; + at::Tensor b = b_raw; + if (b.defined()) { + with_bias = true; + if (b.dim() == 1) { + TORCH_CHECK( + b.size(0) == n || b.size(0) == 1, + "matmul supports [n] or [1] when bias dim is 1 ..."); + if (b.size(0) == 0) { + with_bias = false; + } else if (m1.dim() == 3) { + b = b.expand({mb, m, n}).contiguous(); + } else if (m1.dim() == 2) { + b = b.expand({1, n}).contiguous(); + } + } else if (b.dim() == 2) { + TORCH_CHECK( + (b.size(0) == m && b.size(1) == n) || + (b.size(0) == 1 && b.size(1) == n) || + (b.size(0) == m && b.size(1) == 1) || + (b.size(0) == 1 && b.size(1) == 1), + "matmul supports [m, n] or [1, n] or [m, 1] or [1, 1] when bias dim is 2 ..."); + if (b.size(0) == 1 && b.size(1) == 1) + b = b.expand({1, n}).contiguous(); + } else if (b.dim() == 3) { + TORCH_CHECK( + at::are_expandable({mb, m, n}, b.sizes()), + "matmul bias must be expandable to:", + dst.sizes(), + " but got:", + b.sizes()); + b = b.expand({mb, m, n}).contiguous(); + } else if (b.dim() == 0) { + TORCH_CHECK( + b.numel() == 1, "matmul supports 1 numel when bias dim is [] ..."); + if (m1.dim() == 3) { + b = b.expand({mb, m, n}).contiguous(); + } else { + b = b.expand({1, n}).contiguous(); + } + } else { + TORCH_CHECK(0, "unsupported bias dim in matmul ..."); + } + } + + b = b.contiguous(); // avoid reorder 2 times + + // xpu matmul support both ab/ba shape for m2 tensor, we don't check any more + auto m1_usr_dt = get_onednn_dtype(m1); + auto m2_usr_dt = get_onednn_dtype(m2); + auto dst_usr_dt = get_onednn_dtype(dst); + + auto m1_dt = m1_usr_dt; + auto m2_dt = m2_usr_dt; + auto dst_dt = dst_usr_dt; + dnnl::memory::data_type bias_dt; + + dnnl::memory::desc m1_md, m1_usr_md, m1_any_md; + dnnl::memory::desc m2_md, m2_usr_md, m2_any_md; + dnnl::memory::desc dst_md, dst_usr_md, dst_any_md; + dnnl::memory::desc bias_md; + + // Naive Master weight + if (m1_dt == dnnl::memory::data_type::bf16 && m2_dt == dnnl::memory::data_type::f32) { + m2_dt = dnnl::memory::data_type::bf16; + dst_dt = dnnl::memory::data_type::bf16; + } else if ( + m1_dt == dnnl::memory::data_type::f32 && m2_dt == dnnl::memory::data_type::bf16) { + m1_dt = dnnl::memory::data_type::bf16; + dst_dt = dnnl::memory::data_type::bf16; + } + + dnnl::memory::dims m1_dims, m2_dims, dst_dims, bias_dims; + dnnl::memory::dims m1_strides, m2_strides, dst_strides, bias_strides; + if (dims == 2) { + m1_dims = {m, k}; + m2_dims = {k, n}; + dst_dims = {m, n}; + + m1_strides = {m1.stride(0), m1.stride(1)}; + if (m2_trans) { + m2_strides = {m2.stride(0), m2.stride(1)}; + } else { + m2_strides = {m2.stride(1), m2.stride(0)}; + } + dst_strides = {dst.stride(0), dst.stride(1)}; + } else { + m1_dims = {mb, m, k}; + m2_dims = {mb, k, n}; + dst_dims = {mb, m, n}; + + m1_strides = {m1.stride(0), m1.stride(1), m1.stride(2)}; + if (m2_trans) { + m2_strides = {m2.stride(0), m2.stride(1), m2.stride(2)}; + } else { + m2_strides = {m2.stride(0), m2.stride(2), m2.stride(1)}; + } + dst_strides = {dst.stride(0), dst.stride(1), dst.stride(2)}; + } + + if (with_bias) { + bias_dims = 
get_onednn_dims(b); + bias_dt = get_onednn_dtype(b); + bias_strides = get_onednn_strides(b); + } + + dnnl::post_ops po = attr.extract_post_ops(dst); + + std::unordered_map args; + dnnl::matmul matmul_p; + dnnl::matmul::primitive_desc matmul_pd; + + // STEP1: create memory desc + m1_md = dnnl::memory::desc(m1_dims, m1_dt, m1_strides); + m2_md = dnnl::memory::desc(m2_dims, m2_dt, m2_strides); + dst_md = dnnl::memory::desc(dst_dims, dst_dt, dst_strides); + + // STEP2: creat attribute + dnnl::primitive_attr pattr; + pattr.set_post_ops(po); + + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + + // scratchpad + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + if (m1_dt == dnnl::memory::data_type::f32) { + pattr.set_fpmath_mode(dnnl::fpmath_mode::strict); + } + + // STEP3: create primitive + if (with_bias) { + bias_md = dnnl::memory::desc(bias_dims, bias_dt, bias_strides); + matmul_pd = + dnnl::matmul::primitive_desc(engine, m1_md, m2_md, bias_md, dst_md, pattr); + } else { + matmul_pd = dnnl::matmul::primitive_desc(engine, m1_md, m2_md, dst_md, pattr); + } + + matmul_p = dnnl::matmul(matmul_pd); + + m1_usr_md = dnnl::memory::desc(m1_dims, m1_usr_dt, m1_strides); + m2_usr_md = dnnl::memory::desc(m2_dims, m2_usr_dt, m2_strides); + dst_usr_md = dnnl::memory::desc(dst_dims, dst_usr_dt, dst_strides); + + // STEP4: create memory + auto m1_usr_m = make_onednn_memory(m1_usr_md, engine, m1.data_ptr()); + auto m2_usr_m = make_onednn_memory(m2_usr_md, engine, m2.data_ptr()); + auto dst_usr_m = make_onednn_memory(dst_usr_md, engine, dst.data_ptr()); + + auto expected_m1_md = matmul_pd.src_desc(); + auto expected_m2_md = matmul_pd.weights_desc(); + auto expected_dst_md = matmul_pd.dst_desc(); + + dnnl::memory m1_m = m1_usr_m, m2_m = m2_usr_m, dst_m = dst_usr_m; + at::Tensor m1_, m2_, dst_; + + if (attr.with_binary()) + attr.construct_post_binary(matmul_pd, args); + + size_t scratchpad_size = matmul_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, m1.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_memory = make_onednn_memory( + matmul_pd.scratchpad_desc(), engine, scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_memory}); + + args.insert({DNNL_ARG_SRC, m1_m}); + args.insert({DNNL_ARG_WEIGHTS, m2_m}); + args.insert({DNNL_ARG_DST, dst_m}); + if (with_bias) { + auto bias_m = make_onednn_memory(bias_md, engine, b.data_ptr()); + args.insert({DNNL_ARG_BIAS, bias_m}); + } + + sycl::event matmul_event = dnnl::sycl_interop::execute(matmul_p, stream, args, deps); + + if (!dst.is_same(result)) + result.copy_(dst); + + return matmul_event; +} + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp new file mode 100644 index 0000000000000..8dd3dc329c70f --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp @@ -0,0 +1,380 @@ +#include + +namespace at::native::onednn { + +dnnl::memory make_onednn_memory( + dnnl::memory::desc md, + dnnl::engine& engine, + void* ptr){ + return dnnl::sycl_interop::make_memory( + md, + engine, + dnnl::sycl_interop::memory_kind::usm, + ptr == nullptr ? 
DNNL_MEMORY_ALLOCATE : ptr); +} + +dnnl::memory::format_tag get_dnnl_default_format( + int ndims, + bool is_channels_last, + bool allow_undef) { + switch (ndims) { + case 1: + return dnnl::memory::format_tag::a; + case 2: + return dnnl::memory::format_tag::ab; + case 3: + return is_channels_last ? dnnl::memory::format_tag::acb + : dnnl::memory::format_tag::abc; + case 4: + return is_channels_last ? dnnl::memory::format_tag::acdb + : dnnl::memory::format_tag::abcd; + case 5: + return is_channels_last ? dnnl::memory::format_tag::acdeb + : dnnl::memory::format_tag::abcde; + case 6: + return dnnl::memory::format_tag::abcdef; + case 7: + return dnnl::memory::format_tag::abcdefg; + case 8: + return dnnl::memory::format_tag::abcdefgh; + case 9: + return dnnl::memory::format_tag::abcdefghi; + case 10: + return dnnl::memory::format_tag::abcdefghij; + case 11: + return dnnl::memory::format_tag::abcdefghijk; + case 12: + return dnnl::memory::format_tag::abcdefghijkl; + default: + if (!allow_undef) { + TORCH_CHECK(false, "oneDNN doesn't support tensor dimension > 12"); + } + return dnnl::memory::format_tag::undef; + } +} + +dnnl::memory::data_type get_onednn_dtype( + const at::Tensor& tensor, + bool allow_undef) { + switch (tensor.scalar_type()) { + case at::ScalarType::Byte: + return dnnl::memory::data_type::u8; + case at::ScalarType::Char: + return dnnl::memory::data_type::s8; + case at::ScalarType::QInt8: + return dnnl::memory::data_type::s8; + case at::ScalarType::QUInt8: + return dnnl::memory::data_type::u8; + case at::ScalarType::Int: + return dnnl::memory::data_type::s32; + case at::ScalarType::Half: + return dnnl::memory::data_type::f16; + case at::ScalarType::Float: + return dnnl::memory::data_type::f32; + case at::ScalarType::BFloat16: + return dnnl::memory::data_type::bf16; + default: + if (!allow_undef) { + TORCH_CHECK( + false, + c10::toString(tensor.scalar_type()), + " is not supported in oneDNN!"); + } + return dnnl::memory::data_type::undef; + }; +} + +dnnl::memory::data_type get_onednn_dtype_include_double( + const at::Tensor& tensor, + bool allow_undef) { + if (tensor.scalar_type() == at::ScalarType::Double) + return dnnl::memory::data_type::f64; + return get_onednn_dtype(tensor, allow_undef); +} + +bool is_supported_onednn_dtype(const at::Tensor& tensor) { + return get_onednn_dtype(tensor, /*allow_undef*/ true) == + dnnl::memory::data_type::undef + ? false + : true; +} + +dnnl::memory::dims get_onednn_dims(const at::Tensor& tensor) { + dnnl::memory::dims dims; + for (size_t i = 0; i < tensor.sizes().size(); i++) + dims.push_back(tensor.size(i)); + return dims; +} + +dnnl::memory::dims get_onednn_strides(const at::Tensor& tensor) { + dnnl::memory::dims strides; + for (size_t i = 0; i < tensor.strides().size(); i++) + strides.push_back(tensor.stride(i)); + return strides; +} + +dnnl::memory::desc get_onednn_md(const at::Tensor& tensor) { + return { + get_onednn_dims(tensor), + get_onednn_dtype(tensor), + get_onednn_strides(tensor)}; +} + +bool onednn_strides_check(const at::Tensor& src) { + auto adims = get_onednn_dims(src); + int ndims = (int)adims.size(); + auto dims = adims.data(); + auto data_type = static_cast( + get_onednn_dtype(src, /*allow_undef*/ true)); + auto strides_info = get_onednn_strides(src); + auto strides = strides_info.empty() ? 
nullptr : &strides_info[0]; + + dnnl_memory_desc_t md; + dnnl_memory_desc_create_with_strides(&md, ndims, dims, data_type, strides); + dnnl_format_kind_t md_fmt_kind; + int md_ndims; + int md_inner_nblks; + dnnl_dims_t* md_padded_dims = nullptr; + + dnnl_memory_desc_query(md, dnnl_query_inner_nblks_s32, &md_inner_nblks); + dnnl_memory_desc_query(md, dnnl_query_format_kind, &md_fmt_kind); + dnnl_memory_desc_query(md, dnnl_query_ndims_s32, &md_ndims); + dnnl_memory_desc_query(md, dnnl_query_padded_dims, &md_padded_dims); + if (strides == nullptr || md_ndims == 0 || + md_fmt_kind != dnnl_format_kind_t::dnnl_blocked) + return true; + + dnnl_dims_t blocks = {0}; + int perm[DNNL_MAX_NDIMS] = {0}; + for (int d = 0; d < md_ndims; ++d) { + // no strides check needed for empty tensor + if (md_padded_dims[d] == 0) + return true; + + // no strides verification for runtime dims + if (strides[d] == DNNL_RUNTIME_DIM_VAL) + return true; + + perm[d] = d; + blocks[d] = 1; + } + + auto block_size = 1; + dnnl_dims_t md_inner_blks; + dnnl_dims_t md_blk_inner_idxs; + dnnl_memory_desc_query(md, dnnl_query_inner_idxs, &md_blk_inner_idxs); + dnnl_memory_desc_query(md, dnnl_query_inner_blks, &md_inner_blks); + for (int iblk = 0; iblk < md_inner_nblks; ++iblk) { + blocks[md_blk_inner_idxs[iblk]] *= md_inner_blks[iblk]; + block_size *= md_inner_blks[iblk]; + } + + // A custom comparator to yield linear order on perm + auto idx_sorter = [&](const int a, const int b) -> bool { + if (strides[a] == strides[b] && md_padded_dims[a] == md_padded_dims[b]) + return a < b; + else if (strides[a] == strides[b]) + return md_padded_dims[a] < md_padded_dims[b]; + else + return strides[a] < strides[b]; + }; + std::sort(perm, perm + md_ndims, idx_sorter); + + auto min_stride = block_size; + for (int idx = 0; idx < md_ndims; ++idx) { + const int d = perm[idx]; + + // Make an exception for strides[d] == 0 as it has broadcast semantics + // Note: owing to being sorted, these are the initial strides + if (strides[d] == 0) + continue; + else if (strides[d] < min_stride) + return false; + + // update min_stride for next iteration + const auto padded_dim = *md_padded_dims[d]; + min_stride = block_size * strides[d] * (padded_dim / blocks[d]); + } + return true; +} + +bool is_broadcast(const at::Tensor& t) { + for (int i = 0; i < t.dim(); i++) { + if (t.stride(i) == 0) + return true; + } + return false; +} + +bool is_onednn_matmul_strides( + const at::Tensor& tensor, + bool is_dst) { + // https://oneapi-src.github.io/oneDNN/dev_guide_matmul.html + // oneDNN matmul only support 2-dim and 3-dim + // 2D src(Mxk), wei(KxN), dst(MxN) + // 3D src(SxMxK), wei(WxKxN), dst(DxMxN) + auto sizes = tensor.sizes(); + auto tensor_dim = sizes.size(); + if (tensor_dim != 2 && tensor_dim != 3) + return false; + + if (tensor.is_contiguous()) + return true; + + // the overlaped cases are not supported + dnnl::memory::dims strides = get_onednn_strides(tensor); + int64_t storage_size = 1; + for (size_t dim = 0; dim < tensor_dim; ++dim) + storage_size += (sizes[dim] - 1) * strides[dim]; + if (storage_size < tensor.numel()) + return false; + + // the broadcast cases are not supported + if (is_broadcast(tensor)) { + return false; + } + + if (is_dst) { + // The memory format of the destination tensor should always + // be plain with n axis contiguous + if (strides[-1] != 1) + return false; + } else { + // the src and weight must have at least one of the axes + // m or k and n or k contiguous (i.e., stride=1) respectively. 
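    // e.g. a row-major [M, K] matrix (strides {K, 1}) passes, as does the
    // [M, K] transpose of a row-major [K, M] buffer (strides {1, M}); a view
    // whose last two strides are both non-unit, such as {2K, 2} from slicing
    // every other column, is rejected here.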
+ if (strides[tensor_dim - 1] != 1 && strides[tensor_dim - 2] != 1) + return false; + } + + if (!onednn_strides_check(tensor)) + return false; + return true; +} + +bool is_broadcast_from_other_to_self( + const at::Tensor& self, + const at::Tensor& other) { + return ( + self.sizes() != other.sizes() && + at::is_expandable_to(other.sizes(), self.sizes())); +} + +at::MemoryFormat get_cl_tag_by_ndim(const int64_t ndim) { + TORCH_CHECK( + 3 == ndim || 4 == ndim || 5 == ndim, + "ndim must be 3, 4 or 5 when get cl tag"); + if (3 == ndim) { + return at::MemoryFormat::Contiguous; + } else if (5 == ndim) { + return at::MemoryFormat::ChannelsLast3d; + } else { + return at::MemoryFormat::ChannelsLast; + } +} + +bool binary_valid( + const at::Tensor& self, + const at::Tensor& other, + bool is_fusion) { + if (self.sizes() != other.sizes() && + !is_broadcast_from_other_to_self(self, other)) + return false; + + /* If the following conditions are satisfied, then oneDNN path will be + selected: + * 1. self and other should be xpu tensor and be defined. + * 2. self or other should not be scalar (wrapped tensor). + * 3. dim of self and other should be equal and must be larger than 0 and + smaller than 7. + * 4. the datatype should be supported by oneDNN primitive. + * 5. self and other should be in the same datatype. + * 6. self and other should be contiguous or channel-last contiguous.*/ + + + // 1. self and other should be xpu tensor and be defined. + if ((!self.defined()) || (!other.defined()) || (!self.is_xpu()) || + (!other.is_xpu())) + return false; + + // 2. self or other should not be scalar (wrapped tensor). + if (self.unsafeGetTensorImpl()->is_wrapped_number() || other.unsafeGetTensorImpl()->is_wrapped_number()) + return false; + + // 3. dim of self and other should be equal and must be larger than 0 and + // smaller than 7. + if ((self.dim() <= 0) || (other.dim() <= 0) || (self.dim() != other.dim()) || + (self.dim() > 6) || (other.dim() > 6)) + return false; + + // 4. the datatype should be supported by oneDNN primitive. + switch (self.scalar_type()) { + case at::ScalarType::Char: + break; + case at::ScalarType::Byte: + break; + case at::ScalarType::Half: + break; + case at::ScalarType::Float: + break; + case at::ScalarType::BFloat16: + break; + default: + return false; + }; + + // 5. datatype check + if (is_fusion) { + // for fusion case, the fusion can be performed on scalar_type or Float + // datatype. + if (self.scalar_type() != other.scalar_type() && + other.scalar_type() != at::ScalarType::Float) { + return false; + } + } else { + if (self.scalar_type() != other.scalar_type()) { + // for non-fusion case: self and other should be in the same datatype. + return false; + } + } + + // 6. self and other should be contiguous or channel-last contiguous. 
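  // e.g. for 4-D inputs both tensors must be NCHW-contiguous or both must be
  // NHWC (channels-last) contiguous; mixed layouts return false so the caller
  // does not select the oneDNN path.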
+ const auto ndim = self.ndimension(); + auto cl_tag = at::MemoryFormat::ChannelsLast; + if (3 == ndim || 4 == ndim || 5 == ndim) { + cl_tag = get_cl_tag_by_ndim(ndim); + } + if ((self.is_contiguous() && other.is_contiguous()) || + (self.is_contiguous(cl_tag) && other.is_contiguous(cl_tag))) + return true; + return false; +} + +static inline bool is_channels_last(at::MemoryFormat fmt){ + return (at::MemoryFormat::ChannelsLast == fmt) || (at::MemoryFormat::ChannelsLast3d == fmt); +} + +static inline bool is_smf_channels_last(const Tensor& t){ + return is_channels_last(t.suggest_memory_format()); +} + +bool use_channels_last_for_conv( + const at::Tensor& src, + const at::Tensor& weight, + bool is_transpose){ + + if (!src.defined() || src.is_sparse()) { + // suggest channels_first + return false; + } + + auto suggest_channels_last_format = + (is_smf_channels_last(src) || is_smf_channels_last(weight)); + if (suggest_channels_last_format) { + // suggest channels_last + return true; + } + + return false; +} + +} diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h new file mode 100644 index 0000000000000..2929d3159e139 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h @@ -0,0 +1,61 @@ +#pragma once +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +#define ONEDNN_SUPPORT_DETERMINISTIC (DNNL_VERSION_MAJOR >=3 && DNNL_VERSION_MINOR >=4) + +namespace at::native::onednn { + +dnnl::memory::format_tag get_dnnl_default_format( + int ndims, + bool is_channels_last = false, + bool allow_undef = false); + +dnnl::memory::data_type get_onednn_dtype( + const at::Tensor& tensor, + bool allow_undef = false); + +dnnl::memory::data_type get_onednn_dtype_include_double( + const at::Tensor& tensor, + bool allow_undef = false); + +bool is_supported_onednn_dtype(const at::Tensor& tensor); + +dnnl::memory::dims get_onednn_dims(const at::Tensor& tensor); + +dnnl::memory::dims get_onednn_strides(const at::Tensor& tensor); +dnnl::memory::desc get_onednn_md(const at::Tensor& tensor); + +bool onednn_strides_check(const at::Tensor& src); +bool is_broadcast(const at::Tensor& t); + +bool is_onednn_matmul_strides( + const at::Tensor& tensor, + bool is_dst = false); + +bool is_broadcast_from_other_to_self( + const at::Tensor& self, + const at::Tensor& other); + +at::MemoryFormat get_cl_tag_by_ndim(const int64_t ndim); + +bool binary_valid( + const at::Tensor& self, + const at::Tensor& other, + bool is_fusion = false); + +bool use_channels_last_for_conv( + const at::Tensor& src, + const at::Tensor& weight, + bool is_transpose); + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h new file mode 100644 index 0000000000000..0c219fc8c6db6 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h @@ -0,0 +1,110 @@ +#pragma once + +#include +#include +#include +#include + +namespace at::native::onednn{ + +TORCH_API sycl::event matmul( + at::Tensor& result, + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& b_raw, + bool m2_trans, + Attr attr, + const std::vector& deps = {}); + +TORCH_API sycl::event convolution( + at::Tensor& dst, + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& bia, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + Attr& attr, + const std::vector& deps = {}); 
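// Example: a minimal sketch of how a caller can drive the forward convolution
// declared above. The tensor shapes, the fused ReLU, and the preallocated dst
// are illustrative assumptions; the real call sites are the XPU kernels
// registered via convolution_overrideable earlier in this patch.
//
//   using namespace at::native::onednn;
//
//   // fp32 NCHW tensors on the current XPU device
//   at::Tensor src = at::randn({8, 3, 32, 32}, at::kXPU);
//   at::Tensor wgh = at::randn({16, 3, 3, 3}, at::kXPU);
//   at::Tensor bia = at::randn({16}, at::kXPU);
//
//   std::vector<int64_t> stride = {1, 1}, padding = {1, 1}, dilation = {1, 1};
//   auto dst_dims = conv_dst_size(
//       src.ndimension(), src.sizes(), wgh.sizes(),
//       padding, padding, stride, dilation);
//   at::Tensor dst = at::empty(dst_dims, src.options());
//
//   Attr attr;                                        // post-op list starts empty
//   attr.append_post_eltwise(
//       1.f, 0.f, 0.f, attr.kind_with_relu);          // optionally fuse a ReLU
//
//   sycl::event done = convolution(
//       dst, src, wgh, bia,
//       /*padding_front_top_left=*/padding,
//       /*padding_back_bottom_right=*/padding,
//       stride, dilation, /*groups=*/1, attr);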
+ +TORCH_API sycl::event convolution_backward_weights( + at::Tensor& diff_weight, + at::Tensor& diff_bia, + const at::Tensor& diff_dst, + const at::Tensor& src, + IntArrayRef diff_weight_aten_size, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + const std::vector& deps = {}); + +TORCH_API sycl::event convolution_backward_data( + at::Tensor& diff_src, + const at::Tensor& diff_dst, + const at::Tensor& weight, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool bias_defined, + const std::vector& deps = {}); + +TORCH_API sycl::event deconvolution( + at::Tensor& dst, + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& bia, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dst_padding, + IntArrayRef dilation, + int64_t groups, + Attr& attr, + const std::vector& deps = {}); + +TORCH_API sycl::event deconvolution_backward_data( + at::Tensor& diff_src, + const at::Tensor& diff_dst, + const at::Tensor& weight, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + bool bias_defined, + const std::vector& deps = {}); + +TORCH_API sycl::event deconvolution_backward_weights( + at::Tensor& diff_weight, + at::Tensor& diff_bia, + const at::Tensor& diff_dst, + const at::Tensor& src, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + const std::vector& deps = {}); + +dnnl::memory::dims conv_dst_size( + int64_t ndim, + IntArrayRef src_tz, + IntArrayRef wgh_tz, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation); + +dnnl::memory::dims deconv_dst_size( + IntArrayRef src_size, + IntArrayRef wgh_size, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + IntArrayRef dst_padding, + int64_t groups); + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.cpp new file mode 100644 index 0000000000000..9bec64c8c0248 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.cpp @@ -0,0 +1,27 @@ +#include +#include + +/* * + * Do NOT put any kernels or call any device binaries here! + * Only maintain oneDNN runtime states in this file. 
+ * */ +namespace at::native::onednn { + +using namespace dnnl; + +GpuEngineManager& GpuEngineManager::Instance() { + static GpuEngineManager myInstance; + return myInstance; +} + +GpuStreamManager& GpuStreamManager::Instance() { + static thread_local GpuStreamManager myInstance; + return myInstance; +} + +bool set_onednn_verbose(int level) { + dnnl::status rs = dnnl::set_verbose(level); + return rs == dnnl::status::success; +} + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h new file mode 100644 index 0000000000000..c7e7a5e94b406 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h @@ -0,0 +1,75 @@ +#pragma once + +#include + +#include +#include +#include + +#include +#include +#include + +namespace at::native::onednn { + +TORCH_API dnnl::memory make_onednn_memory( + dnnl::memory::desc md, + dnnl::engine& engine, + void* ptr); + +// Keep non-static and non-inline +bool set_onednn_verbose(int level); + +// GpuEngineManager singleton +struct TORCH_API GpuEngineManager { + static GpuEngineManager& Instance(); // Singleton + + dnnl::engine& get_engine(const Device& device) { + TORCH_INTERNAL_ASSERT(device.type() == kXPU); + TORCH_INTERNAL_ASSERT(device.index() < c10::xpu::device_count()); + return *engine_pool[device.index()]; + } + + GpuEngineManager(GpuEngineManager const&) = delete; + GpuEngineManager& operator=(GpuEngineManager const&) = delete; + + protected: + GpuEngineManager() { + int device_count = (int)c10::xpu::device_count(); + TORCH_INTERNAL_ASSERT(device_count > 0); + for (int i = 0; i < device_count; i++) { + engine_pool.push_back( + std::make_shared(dnnl::sycl_interop::make_engine( + c10::xpu::get_raw_device(i), c10::xpu::get_device_context() + ))); + } + } + ~GpuEngineManager() {} + + private: + std::vector> engine_pool; +}; + +// GpuStreamManager singleton +struct TORCH_API GpuStreamManager { + static GpuStreamManager& Instance(); // Singleton + + dnnl::stream get_stream() { + c10::DeviceIndex device_index = c10::xpu::current_device(); + TORCH_INTERNAL_ASSERT(device_index < c10::xpu::device_count()); + return dnnl::sycl_interop::make_stream( + GpuEngineManager::Instance().get_engine({c10::kXPU, device_index}), + c10::xpu::getCurrentXPUStream(device_index).queue()); + } + + GpuStreamManager(GpuStreamManager const&) = delete; + GpuStreamManager& operator=(GpuStreamManager const&) = delete; + + protected: + GpuStreamManager() { + } + ~GpuStreamManager() {} + +}; + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mps/MPSGraphSonomaOps.h b/aten/src/ATen/native/mps/MPSGraphSonomaOps.h new file mode 100644 index 0000000000000..b4cf3ad5dbcc8 --- /dev/null +++ b/aten/src/ATen/native/mps/MPSGraphSonomaOps.h @@ -0,0 +1,53 @@ +#pragma once + +#include + +#if !defined(__MAC_14_0) && \ + (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0)) + +typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) +{ + MPSGraphFFTScalingModeNone = 0L, + MPSGraphFFTScalingModeSize = 1L, + MPSGraphFFTScalingModeUnitary = 2L, +}; + +@interface FakeMPSGraphFFTDescriptor : NSObject +@property (readwrite, nonatomic) BOOL inverse; +@property (readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode; +@property (readwrite, nonatomic) BOOL roundToOddHermitean; ++(nullable instancetype) descriptor; +@end + +@compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor; + +@interface MPSGraph (SonomaOps) +-(MPSGraphTensor 
* _Nonnull) conjugateWithTensor:(MPSGraphTensor * _Nonnull) tensor + name:(NSString * _Nullable) name; + +-(MPSGraphTensor * _Nonnull) realPartOfTensor:(MPSGraphTensor * _Nonnull) tensor + name:(NSString * _Nullable) name; + + +-(MPSGraphTensor * _Nonnull) fastFourierTransformWithTensor:(MPSGraphTensor * _Nonnull) tensor + axes:(NSArray * _Nonnull) axes + descriptor:(MPSGraphFFTDescriptor * _Nonnull) descriptor + name:(NSString * _Nullable) name; + +-(MPSGraphTensor * _Nonnull) realToHermiteanFFTWithTensor:(MPSGraphTensor * _Nonnull) tensor + axes:(NSArray * _Nonnull) axes + descriptor:(MPSGraphFFTDescriptor * _Nonnull) descriptor + name:(NSString * _Nullable) name; + +-(MPSGraphTensor * _Nonnull) HermiteanToRealFFTWithTensor:(MPSGraphTensor * _Nonnull) tensor + axes:(NSArray * _Nonnull) axes + descriptor:(MPSGraphFFTDescriptor * _Nonnull) descriptor + name:(NSString * _Nullable) name; +@end + +// define BFloat16 enums for MacOS13 +#define MPSDataTypeBFloat16 ((MPSDataType) (MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16)) + +// define Metal version +#define MTLLanguageVersion3_1 ((MTLLanguageVersion) ((3 << 16) + 1)) +#endif diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h index d47d77b819e50..3e812d0718dcc 100644 --- a/aten/src/ATen/native/mps/OperationUtils.h +++ b/aten/src/ATen/native/mps/OperationUtils.h @@ -2,6 +2,7 @@ #pragma once +#include #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -46,11 +47,13 @@ struct MPSScalar { at::Half h; int64_t i; bool b; + c10::complex cf; + c10::complex ch; + at::BFloat16 bf16; } value {}; }; -void runMPSGraph( - MPSStream* mpsStream, +void runMPSGraph(MPSStream* mpsStream, MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results); @@ -69,10 +72,13 @@ static inline std::string getMPSTypeString(const Tensor& t, bool short_name = fa return getMPSTypeString(t.scalar_type(), short_name); } std::string scalarToMetalTypeString(const c10::ScalarType& scalar_type); +static inline std::string scalarToMetalTypeString(const Tensor& t) { + return scalarToMetalTypeString(t.scalar_type()); +} NSArray* getTensorAxes(const Tensor& t); NSArray* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim); std::string getMPSShapeString(MPSShape* shape); -std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true); +std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true, bool exclude_shape = false); std::string getArrayRefString(const IntArrayRef s); // use has_storage() on the returned tensor to determine if src actually is a view Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst); @@ -327,6 +333,30 @@ inline bool is_dense_in_storage(const at::Tensor& t) { return compute_storage_numel_distance(t) == static_cast(t.numel()); } + +class MetalShaderLibrary { +public: + MetalShaderLibrary(const std::string& src, unsigned nparams_ = 0): shaderSource(src), nparams(nparams_) {} + MetalShaderLibrary(const MetalShaderLibrary&) = delete; + inline id getPipelineStateForFunc(const std::string& fname) { + return getLibraryPipelineState(getLibrary(), fname); + } + id getPipelineStateForFunc(const std::string& fname, const std::initializer_list& params) { + return getLibraryPipelineState(getLibrary(params), fname); + } +private: + id getLibraryPipelineState(id lib, const std::string& fname); + id getLibrary(); + id getLibrary(const std::initializer_list& params); + + id compileLibrary(const std::string& src); + std::string 
shaderSource; + unsigned nparams; + id library = nil; + std::unordered_map> libMap; + std::unordered_map> cplMap; +}; + static inline void mtl_setBuffer(id encoder, const Tensor& t, unsigned idx) { [encoder setBuffer:getMTLBufferStorage(t) offset:t.storage_offset() * t.element_size() @@ -344,4 +374,53 @@ static inline void mtl_dispatch1DJob(id encoder, id generateKernelDataOffsets(id commandEncoder, const TensorIteratorBase& iter, bool use_64bit_index = false); +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1) { + return @{ p1.getMPSGraphTensor(): p1.getMPSGraphTensorData() }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2) { + return @{ + p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(), + }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3) { + return @{ + p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(), + p3.getMPSGraphTensor(): p3.getMPSGraphTensorData(), + }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3, Placeholder& p4) { + return @{ + p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(), + p3.getMPSGraphTensor(): p3.getMPSGraphTensorData(), + p4.getMPSGraphTensor(): p4.getMPSGraphTensorData(), + }; +} + +inline void runMPSGraph(MPSStream* stream, MPSGraph* graph, NSDictionary* feeds, Placeholder& result) { + runMPSGraph(stream, graph, feeds, dictionaryFromPlaceholders(result)); +} + +inline bool supportsComplex() { + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS); +} + +// MPS yet to support double types, but starting from MacOS 14, supports bfloat16 +inline bool supportedFloatingType(ScalarType dtype) { + return dtype == kFloat || dtype == kHalf || dtype == kBFloat16; +} + +inline bool supportedFloatingType(const Tensor& t) { + return supportedFloatingType(t.scalar_type()); +} + +inline bool needsGather(const Tensor& t) { + return !t.is_contiguous() || t.storage_offset(); +} + } // namespace at::native::mps diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm index ef651e784e0fd..8170bd0047397 100644 --- a/aten/src/ATen/native/mps/OperationUtils.mm +++ b/aten/src/ATen/native/mps/OperationUtils.mm @@ -3,8 +3,10 @@ #include #include #include +#include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -48,12 +50,24 @@ void runMPSGraph(MPSStream* mpsStream, MPSGraph* mpsGraph, NSDictionary* feeds, mpsStream->executeMPSGraph(mpsGraph, feeds, results, SyncType::COMMIT_ADAPTIVE); } +static inline void checkSupportsComplex() { + TORCH_CHECK_TYPE(supportsComplex(), "MPS complex types are only supported on MacOS 14.0 or newer."); +} + +static inline void checkSupportsBFloat16() { + TORCH_CHECK_TYPE(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS), + "MPS bfloat16 type is supported on MacOS 14.0 or newer."); +} + MPSDataType getMPSDataType(ScalarType scalar_type) { switch (scalar_type) { case ScalarType::Float: return MPSDataTypeFloat32; case ScalarType::Half: return MPSDataTypeFloat16; + case ScalarType::BFloat16: + checkSupportsBFloat16(); + return MPSDataTypeBFloat16; case ScalarType::Int: return MPSDataTypeInt32; case ScalarType::Long: @@ -71,12 +85,10 @@ MPSDataType getMPSDataType(ScalarType scalar_type) { "Cannot convert a float64 Tensor to MPS as the MPS framework 
doesn't support float64. " "Please use float32 instead.") case ScalarType::ComplexHalf: - TORCH_CHECK_TYPE(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS), - "MPS complex types are only supported on MacOS 14.0 or newer."); + checkSupportsComplex(); return MPSDataTypeComplexFloat16; case ScalarType::ComplexFloat: - TORCH_CHECK_TYPE(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS), - "MPS complex types are only supported on MacOS 14.0 or newer."); + checkSupportsComplex(); return MPSDataTypeComplexFloat32; default: TORCH_CHECK_TYPE( @@ -132,6 +144,9 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { return MPSDataTypeFloat32; case ScalarType::Half: return MPSDataTypeFloat16; + case ScalarType::BFloat16: + checkSupportsBFloat16(); + return MPSDataTypeBFloat16; case ScalarType::Int: return MPSDataTypeInt32; case ScalarType::Long: @@ -145,12 +160,13 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { case ScalarType::Bool: return MPSDataTypeBool; case ScalarType::ComplexHalf: - TORCH_CHECK_TYPE(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS), - "MPS complex types are only supported on MacOS 14.0 or newer."); + checkSupportsComplex(); return MPSDataTypeComplexFloat16; + // This is an intentional fallthrough supporting ComplexDouble for Scalar + // types as they are casted to Complex64 currently. + case ScalarType::ComplexDouble: case ScalarType::ComplexFloat: - TORCH_CHECK_TYPE(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS), - "MPS complex types are only supported on MacOS 14.0 or newer."); + checkSupportsComplex(); return MPSDataTypeComplexFloat32; default: TORCH_CHECK_TYPE( @@ -166,6 +182,8 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { return short_name ? "f32" : "Float32"; case ScalarType::Half: return short_name ? "f16" : "Float16"; + case ScalarType::BFloat16: + return short_name ? "bf16" : "BFloat16"; case ScalarType::Int: return short_name ? 
"i32" : "Int32"; case ScalarType::Long: @@ -193,6 +211,9 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { return "float"; case ScalarType::Half: return "half"; + case ScalarType::BFloat16: + checkSupportsBFloat16(); + return "bfloat"; case ScalarType::Int: return "int"; case ScalarType::Long: @@ -256,7 +277,7 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { return ss.str(); } -std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype) { +std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype, bool exclude_shape) { std::string str; // The key format per tensor would look like ":Float32[1,1,1,10]:" for (const Tensor& tensor : tensors) { @@ -267,8 +288,12 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { if (tensor.dim() == 0) { str += "Scalar"; } else { - const NSString* ns_shape_key = [[getMPSShape(tensor) valueForKey:@"description"] componentsJoinedByString:@","]; - str += std::string(ns_shape_key.UTF8String); + if (exclude_shape) { + str += "[-1]"; + } else { + str += + std::string([[getMPSShape(tensor) valueForKey:@"description"] componentsJoinedByString:@","].UTF8String); + } } str += "]"; } else { @@ -343,9 +368,8 @@ void printTensorNDArray(const Tensor& t) { TORCH_CHECK(src.is_mps(), "Placeholder storage has not been allocated on MPS device!"); // extract the pointer to MTLBuffer from the Tensor's storage id srcBuf = getMTLBufferStorage(src); - bool sliceViewTensor = canSliceViewTensor(src, mpsShape); // a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose()) - if ((!src.is_contiguous() || (src.storage_offset() && !sliceViewTensor)) && gatherTensorData) { + if (needsGather(src) && gatherTensorData) { Tensor emptyShell = Tensor(); // use "_tensor" from Placeholder to retain view's output during its usage in other ops _tensor = gatherViewTensor(src, emptyShell); @@ -361,19 +385,13 @@ void printTensorNDArray(const Tensor& t) { // if buffer size is zero in here, it's not a user error. It could be a missing check for // tensor.numel() == 0 in our internal implementations of ops. TORCH_INTERNAL_ASSERT([srcBuf length] > 0, "Placeholder tensor is empty!"); - const MPSDataType mpsDataType = dataType != MPSDataTypeInvalid ? dataType - : _tensor.dim() == 0 ? getMPSScalarType(_tensor.scalar_type()) - : getMPSDataType(_tensor.scalar_type()); - - if (src.is_contiguous() && src.storage_offset() && sliceViewTensor) { - _value = getMPSGraphTensorDataForView(src, mpsShape, mpsDataType); - } else { - if (!mpsShape) { - mpsShape = getMPSShape(_tensor); - } - - _value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf shape:mpsShape dataType:mpsDataType] autorelease]; + if (dataType == MPSDataTypeInvalid) { + const auto scalar_type = _tensor.scalar_type(); + dataType = _tensor.dim() == 0 ? getMPSScalarType(scalar_type) : getMPSDataType(scalar_type); } + _value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf + shape:mpsShape ? 
mpsShape : getMPSShape(_tensor) + dataType:dataType] autorelease]; TORCH_INTERNAL_ASSERT(_value); _placeholder = mpsGraphTensor; @@ -393,7 +411,7 @@ void printTensorNDArray(const Tensor& t) { MPSNDArray* emptyArray = [[[MPSNDArray alloc] initWithDevice:mpsStream->device() descriptor:desc] autorelease]; result = [[[MPSGraphTensorData alloc] initWithMPSNDArray:emptyArray] autorelease]; } - assert(result); + TORCH_INTERNAL_ASSERT(result); return result; } @@ -404,6 +422,8 @@ MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type) { return {.value.f = scalar.to(), .size = sizeof(float), .type = type}; case ScalarType::Half: return {.value.h = scalar.to(), .size = sizeof(short), .type = type}; + case ScalarType::BFloat16: + return {.value.bf16 = scalar.to(), .size = sizeof(short), .type = type}; case ScalarType::Long: return {.value.i = scalar.to(), .size = sizeof(int64_t), .type = type}; case ScalarType::Int: @@ -416,6 +436,11 @@ MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type) { return {.value.i = scalar.to(), .size = sizeof(uint8_t), .type = type}; case ScalarType::Bool: return {.value.b = scalar.to(), .size = sizeof(bool), .type = type}; + case ScalarType::ComplexHalf: + return {.value.ch = scalar.to>(), .size = sizeof(int32_t), .type = type}; + case ScalarType::ComplexFloat: + case ScalarType::ComplexDouble: + return {.value.cf = scalar.to>(), .size = sizeof(int64_t), .type = type}; default: TORCH_INTERNAL_ASSERT(false, "Unsupported scalar type '", type, "' on MPS backend."); } @@ -455,7 +480,7 @@ Tensor wrapped_scalar_tensor_mps(const Scalar& scalar, const Device device) { } else if (scalar.isComplex()) { tensor = at::scalar_tensor(scalar, at::device(device).dtype(at::kComplexDouble)); } else { - AT_ASSERT(scalar.isIntegral(false)); + TORCH_INTERNAL_ASSERT(scalar.isIntegral(false)); tensor = at::scalar_tensor(scalar, at::device(device).dtype(at::kLong)); } tensor.unsafeGetTensorImpl()->set_wrapped_number(true); @@ -518,7 +543,7 @@ string get_mem_format_string(c10::MemoryFormat memory_format) { mem_format_key = "ChannelsLast"; break; default: - assert(0 && "Invalid memory format\n"); + TORCH_CHECK(false, "Invalid memory format", memory_format); } return mem_format_key; @@ -587,4 +612,74 @@ void executeMPSAllocatorCallback(void* ptr, EventType event) override {} return kernelDataOffsets; } +id MetalShaderLibrary::getLibrary() { + if (C10_UNLIKELY(!library)) { + TORCH_INTERNAL_ASSERT(nparams == 0); + library = compileLibrary(shaderSource); + } + return library; +} + +id MetalShaderLibrary::getLibrary(const std::initializer_list& params) { + TORCH_INTERNAL_ASSERT(nparams == params.size()); + std::string key = ""; + for (auto p : params) { + key += ":" + p; + } + auto lib = libMap[key]; + if (lib) { + return lib; + } + auto it = params.begin(); + switch (nparams) { + case 1: + lib = compileLibrary(fmt::format(shaderSource, *it)); + break; + case 2: { + auto& first = *it++; + auto& second = *it; + lib = compileLibrary(fmt::format(shaderSource, first, second)); + break; + } + case 3: { + auto& first = *it++; + auto& second = *it++; + auto& third = *it; + lib = compileLibrary(fmt::format(shaderSource, first, second, third)); + break; + } + default: + TORCH_INTERNAL_ASSERT(false, "Unsupported number of paramaters ", nparams); + } + return libMap[key] = lib; +} + +id MetalShaderLibrary::compileLibrary(const std::string& src) { + NSError* error = nil; + MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; + [options 
setLanguageVersion:is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS) ? MTLLanguageVersion3_1 + : MTLLanguageVersion2_3]; + auto str = [NSString stringWithCString:src.c_str() encoding:NSASCIIStringEncoding]; + auto device = MPSDevice::getInstance()->device(); + library = [device newLibraryWithSource:str options:options error:&error]; + TORCH_CHECK(library, "Failed to create metal library, error: ", [[error description] UTF8String]); + return library; +} + +id MetalShaderLibrary::getLibraryPipelineState(id lib, const std::string& fname) { + auto key = fmt::format("{}:{}", reinterpret_cast(lib), fname); + auto cpl = cplMap[key]; + if (cpl) { + return cpl; + } + + NSError* error = nil; + id func = [lib newFunctionWithName:[NSString stringWithUTF8String:fname.c_str()]]; + TORCH_CHECK(func, "Failed to create function state object for: ", fname); + cpl = [[lib device] newComputePipelineStateWithFunction:func error:&error]; + TORCH_CHECK(cpl, "Failed to created pipeline state object, error: ", [[error description] UTF8String]); + + return cplMap[key] = cpl; +} + } // namespace at::native::mps diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm index a8ac52c2ec25e..da11401c948d3 100644 --- a/aten/src/ATen/native/mps/operations/Activation.mm +++ b/aten/src/ATen/native/mps/operations/Activation.mm @@ -41,10 +41,6 @@ #include #endif -#ifdef __OBJC__ -#include -#endif - using namespace at::mps; namespace at::native { @@ -53,6 +49,10 @@ Tensor relu_mps(const Tensor& self) { using namespace mps; using CachedGraph = MPSUnaryCachedGraph; + if (self.numel() == 0) { + return self; + } + MPSStream* stream = getCurrentMPSStream(); bool executeGatherOp = @@ -75,13 +75,8 @@ Tensor relu_mps(const Tensor& self) { Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return output; @@ -90,6 +85,10 @@ Tensor relu_mps(const Tensor& self) { Tensor& relu_mps_(Tensor& self) { using namespace mps; using CachedGraph = MPSUnaryCachedGraph; + + if (self.numel() == 0) { + return self; + } // Inplace relu Tensor& output = self; bool executeGatherOp = @@ -117,13 +116,8 @@ Tensor relu_mps(const Tensor& self) { Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, executeGatherOp ? 
out : output, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); if (executeGatherOp) { output.copy_(out); } @@ -137,8 +131,17 @@ Tensor relu_mps(const Tensor& self) { using CachedGraph = MPSUnaryCachedGraph; TORCH_CHECK(output.is_mps()); + if (self.numel() == 0) { + return; + } + MPSStream* stream = getCurrentMPSStream(); + bool executeGatherOp = + !(self.is_contiguous(MemoryFormat::Contiguous) || self.is_contiguous(MemoryFormat::ChannelsLast) || + self.is_contiguous(MemoryFormat::ChannelsLast3d)); + Tensor output_ = at::empty_like(self, executeGatherOp ? MemoryFormat::Contiguous : MemoryFormat::Preserve); + @autoreleasepool { string key = "leaky_relu" + getTensorsStringKey({self}) + ":" + to_string(negative_slope.to()); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { @@ -158,17 +161,16 @@ Tensor relu_mps(const Tensor& self) { newCachedGraph->outputTensor_ = outputTensor; }); - Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp); + Placeholder outputPlaceholder = + Placeholder(cachedGraph->outputTensor_, executeGatherOp ? output_ : output, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); + } + if (executeGatherOp) { + output.copy_(output_); } } @@ -182,8 +184,14 @@ Tensor relu_mps(const Tensor& self) { using CachedGraph = MPSUnaryGradCachedGraph; TORCH_CHECK(output.is_mps()); + if (self.numel() == 0) { + return; + } + MPSStream* stream = getCurrentMPSStream(); + Tensor output_ = at::empty_like(self, self.suggest_memory_format()); + @autoreleasepool { string key = "leaky_relu_backward" + getTensorsStringKey({self, grad_output}) + ":" + to_string(negative_slope.to()); @@ -213,19 +221,13 @@ Tensor relu_mps(const Tensor& self) { Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); - Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, output_); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = 
dictionaryFromPlaceholders(gradOutputPlaceholder, selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } + output.copy_(output_); } TORCH_IMPL_FUNC(log_softmax_mps_out) @@ -266,13 +268,8 @@ Tensor relu_mps(const Tensor& self) { Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -312,15 +309,8 @@ Tensor relu_mps(const Tensor& self) { Placeholder resultPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradPlaceholder.getMPSGraphTensor() : gradPlaceholder.getMPSGraphTensorData(), - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradPlaceholder, outputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, resultPlaceholder); } } @@ -363,13 +353,8 @@ Tensor relu_mps(const Tensor& self) { Placeholder(cachedGraph->outputTensor_, executeGatherOp ? output_ : output, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } if (executeGatherOp) { @@ -447,15 +432,8 @@ Tensor relu_mps(const Tensor& self) { Placeholder(cachedGraph->gradInputTensor_, executeGatherOp ? 
grad_input_ : grad_input, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } if (executeGatherOp) { @@ -507,16 +485,8 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, outputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -555,16 +525,8 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, outputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -609,14 +571,8 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -663,16 +619,8 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c Placeholder gradPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad); Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, gradInput); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradPlaceholder.getMPSGraphTensor() : gradPlaceholder.getMPSGraphTensorData(), - 
selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradPlaceholder, selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -728,6 +676,11 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c auto approximate_type = get_gelutype_enum(approximate); MPSStream* stream = getCurrentMPSStream(); + bool executeGatherOp = + !(self.is_contiguous(MemoryFormat::Contiguous) || self.is_contiguous(MemoryFormat::ChannelsLast) || + self.is_contiguous(MemoryFormat::ChannelsLast3d)); + Tensor output_ = at::empty_like(self, executeGatherOp ? MemoryFormat::Contiguous : MemoryFormat::Preserve); + @autoreleasepool { const auto key = "gelu_out_mps" + getTensorsStringKey({self}) + ":" + gelutype_to_string(approximate_type); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { @@ -744,16 +697,16 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c newCachedGraph->outputTensor_ = outputTensor; }); - Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp); + Placeholder outputPlaceholder = + Placeholder(cachedGraph->outputTensor_, executeGatherOp ? output_ : output, nil, false); - // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); + } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + if (executeGatherOp) { + output.copy_(output_); } } @@ -763,8 +716,11 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c using CachedGraph = MPSUnaryGradCachedGraph; // Empty output - if (grad_input.numel() == 0) + if (self.numel() == 0) { return; + } + + Tensor grad_input_ = at::empty_like(self, self.suggest_memory_format()); auto approximate_type = get_gelutype_enum(approximate); MPSStream* stream = getCurrentMPSStream(); @@ -838,18 +794,12 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c Placeholder gradPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad); Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); - Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input_); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradPlaceholder.getMPSGraphTensor() : gradPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradPlaceholder, selfPlaceholder); + runMPSGraph(stream, 
cachedGraph->graph(), feeds, outputPlaceholder); } + grad_input.copy_(grad_input_); } static void elu_variants_out_mps(const Tensor& self, @@ -864,7 +814,7 @@ static void elu_variants_out_mps(const Tensor& self, auto resultMemFormat = result.suggest_memory_format(); bool executeGatherOp = !(self.is_contiguous(resultMemFormat) && result.is_contiguous(resultMemFormat)); Tensor out; - if (executeGatherOp && resultMemFormat == MemoryFormat::ChannelsLast) { + if (executeGatherOp) { out = at::empty_like(result, MemoryFormat::Contiguous); } @@ -923,18 +873,10 @@ static void elu_variants_out_mps(const Tensor& self, newCachedGraph->outputTensor_ = outputTensor; }); - Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp); - Placeholder outputPlaceholder = - Placeholder(cachedGraph->outputTensor_, out.has_storage() ? out : result, nil, false); - - // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out.has_storage() ? out : result, nil, false); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); if (out.has_storage()) { result.copy_(out); } @@ -1040,15 +982,8 @@ static void elu_variants_out_mps(const Tensor& self, Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? 
out : grad_input, nil, false); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfOrResultPlaceholder.getMPSGraphTensor() : selfOrResultPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfOrResultPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); if (out.has_storage()) { grad_input.copy_(out); } @@ -1095,13 +1030,8 @@ static void elu_variants_out_mps(const Tensor& self, Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1167,16 +1097,8 @@ static void elu_variants_out_mps(const Tensor& self, Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = @{ - gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData(), - }; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, gradOutputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } return grad_input; } @@ -1256,9 +1178,7 @@ Tensor glu_backward_mps(const Tensor& grad_output, const Tensor& self, const int cachedGraph->betaTensor_ : getMPSGraphTensorFromScalar(stream, beta_scalar), cachedGraph->thresholdTensor_ : getMPSGraphTensorFromScalar(stream, threshold_scalar), }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1335,9 +1255,7 @@ Tensor glu_backward_mps(const Tensor& grad_output, const Tensor& self, const int cachedGraph->betaTensor_ : getMPSGraphTensorFromScalar(stream, beta_scalar), cachedGraph->thresholdTensor_ : getMPSGraphTensorFromScalar(stream, threshold_scalar), }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -1357,6 +1275,11 @@ Tensor glu_backward_mps(const Tensor& grad_output, const Tensor& self, const int MPSStream* stream = getCurrentMPSStream(); + bool executeGatherOp = + !(self.is_contiguous(MemoryFormat::Contiguous) || 
self.is_contiguous(MemoryFormat::ChannelsLast) || + self.is_contiguous(MemoryFormat::ChannelsLast3d)); + Tensor result_ = at::empty_like(self, executeGatherOp ? MemoryFormat::Contiguous : MemoryFormat::Preserve); + @autoreleasepool { string key = "mish_out_mps:" + getTensorsStringKey({self}); @@ -1373,16 +1296,15 @@ Tensor glu_backward_mps(const Tensor& grad_output, const Tensor& self, const int newCachedGraph->inputTensor_ = inputTensor; newCachedGraph->outputTensor_ = outputTensor; }); - Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp); + Placeholder outputPlaceholder = + Placeholder(cachedGraph->outputTensor_, executeGatherOp ? result_ : result, nil, false); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); + } + if (executeGatherOp) { + result.copy_(result_); } } @@ -1445,14 +1367,8 @@ Tensor mish_backward_mps(const Tensor& grad_output, const Tensor& self) { Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); return grad_input; } } @@ -1518,9 +1434,7 @@ Tensor mish_backward_mps(const Tensor& grad_output, const Tensor& self) { selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), cachedGraph->lambdTensor_ : getMPSGraphTensorFromScalar(stream, lambd_scalar), }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1587,9 +1501,7 @@ static void shrink_backward_out_mps(const Tensor& grad_output, selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), cachedGraph->lambdTensor_ : getMPSGraphTensorFromScalar(stream, lambd_scalar), }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); return; } } @@ -1648,14 +1560,8 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) { Placeholder weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - // 
Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - weightPlaceholder.getMPSGraphTensor() : weightPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, weightPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; } @@ -1720,16 +1626,8 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) { Placeholder gradInputPlaceholder = Placeholder(cachedGraph->outputTensor_, grad_input); Placeholder weightedGradPlaceholder = Placeholder(cachedGraph->weightedGradTensor_, weight_grad); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - weightPlaceholder.getMPSGraphTensor() : weightPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = @{ - gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData(), - weightedGradPlaceholder.getMPSGraphTensor() : weightedGradPlaceholder.getMPSGraphTensorData() - }; + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfPlaceholder, weightPlaceholder); + auto results = dictionaryFromPlaceholders(gradInputPlaceholder, weightedGradPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } return std::tuple{grad_input, weight_grad}; @@ -1770,14 +1668,8 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) { Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1832,16 +1724,8 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) { Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, gradOutputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -1881,14 +1765,8 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) { Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - // Create dictionary of 
inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1943,13 +1821,8 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) { Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, gradOutputPlaceholder); + auto results = dictionaryFromPlaceholders(gradInputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } @@ -2033,14 +1906,8 @@ Tensor hardtanh_backward_mps(const Tensor& grad_output, const Tensor& self, cons Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfPlaceholder); + auto results = dictionaryFromPlaceholders(gradInputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } @@ -2116,12 +1983,8 @@ Tensor hardtanh_backward_mps(const Tensor& grad_output, const Tensor& self, cons Placeholder(cachedGraph->outputTensor_, out.has_storage() ? 
out : output, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + auto results = dictionaryFromPlaceholders(outputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); if (out.has_storage()) { output.copy_(out); @@ -2218,15 +2081,8 @@ Tensor hardswish_backward_mps(const Tensor& grad_output, const Tensor& self) { Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, gradInputPlaceholder); } return grad_input; } diff --git a/aten/src/ATen/native/mps/operations/AdaptivePooling.mm b/aten/src/ATen/native/mps/operations/AdaptivePooling.mm index c88d468f7ed15..c38d5faec6a73 100644 --- a/aten/src/ATen/native/mps/operations/AdaptivePooling.mm +++ b/aten/src/ATen/native/mps/operations/AdaptivePooling.mm @@ -37,8 +37,9 @@ static void set_kernel_params(int64_t isizeH, if (isizeH >= osizeH) { if (check_avg_pooling) { - TORCH_CHECK((isizeH % osizeH == 0 && isizeW % osizeW == 0), - "Adaptive pool MPS: input sizes must be divisible by output sizes."); + TORCH_CHECK( + (isizeH % osizeH == 0 && isizeW % osizeW == 0), + "Adaptive pool MPS: input sizes must be divisible by output sizes. Non-divisible input sizes are not implemented on MPS device yet. For now, you can manually transfer tensor to cpu in this case. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/96056)"); } strideH = (int64_t)(isizeH / osizeH); strideW = (int64_t)(isizeW / osizeW); @@ -46,8 +47,9 @@ static void set_kernel_params(int64_t isizeH, kernel_sizeW = isizeW - (osizeW - 1) * strideW; } else { if (check_avg_pooling) { - TORCH_CHECK((osizeH % isizeH == 0 && osizeW % isizeW == 0), - "Adaptive pool MPS: output sizes must be divisible by input sizes."); + TORCH_CHECK( + (osizeH % isizeH == 0 && osizeW % isizeW == 0), + "Adaptive pool MPS: output sizes must be divisible by input sizes. Non-divisible input sizes are not implemented on MPS device yet. For now, you can manually transfer tensor to cpu in this case. 
Please refer to [this issue](https://github.com/pytorch/pytorch/issues/96056)"); } strideH = (int64_t)(osizeH / isizeH); strideW = (int64_t)(osizeW / isizeW); diff --git a/aten/src/ATen/native/mps/operations/BinaryKernel.mm b/aten/src/ATen/native/mps/operations/BinaryKernel.mm index 3b428d09c2d3f..409512a737971 100644 --- a/aten/src/ATen/native/mps/operations/BinaryKernel.mm +++ b/aten/src/ATen/native/mps/operations/BinaryKernel.mm @@ -6,6 +6,8 @@ #include #include #include +// For MTLLanguageVersion_3_1 +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -22,7 +24,7 @@ namespace at::native { namespace mps { -static const char* METAL_BINARY = R"BINARY_METAL( +static MetalShaderLibrary lib(R"BINARY_METAL( #include using namespace metal; @@ -190,24 +192,25 @@ kernel void nextafter_kernel(constant void * input_ [[buffer(0)]], device void * out_ [[buffer(2)]], constant uint3 * offsets [[buffer(3)]], uint tid [[thread_position_in_grid]]) { - device T* out = (device T*)((device uint8_t*)out_ + offsets[tid].x); - constant T* input = (constant T*)((constant uint8_t*)input_ + offsets[tid].y); - constant T* other = (constant T*)((constant uint8_t*)other_ + offsets[tid].z); - - if (*input == *other) - { - *out = *other; - } - else if (isnan(*input) || isnan(*other)) - { + auto out = (device T*)((device uint8_t*)out_ + offsets[tid].x); + auto input = *(constant T*)((constant uint8_t*)input_ + offsets[tid].y); + auto other = *(constant T*)((constant uint8_t*)other_ + offsets[tid].z); +#if __METAL_VERSION__ >= 310 + *out = nextafter(input, other); +#else + if (input == other) { + *out = input; + } else if (isnan(input) || isnan(other)) { *out = NAN; - } - else - { - U bits = as_type(*input); - bits = bits + ((*other > *input) ? 1 : -1); + } else if (input == 0) { + constexpr auto one = as_type(static_cast(1)); + *out = other > 0 ? one : -one; + } else { + U bits = as_type(input); + (input > 0) ^ (input > other) ? 
bits++ : bits--; *out = as_type(bits); } +#endif } #define REGISTER_NEXTAFTER_OP(DTYPE, UTYPE) \ @@ -249,43 +252,7 @@ kernel void complex_kernel(constant void * real_ [[buffer(0)]], REGISTER_COMPLEX_OUT_OP(float); REGISTER_COMPLEX_OUT_OP(half); -)BINARY_METAL"; - -using namespace mps; - -static id compileBinaryOpsLibrary(id device) { - static id binaryLibrary = nil; - if (binaryLibrary) { - return binaryLibrary; - } - - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - binaryLibrary = [device newLibraryWithSource:[NSString stringWithCString:METAL_BINARY encoding:NSASCIIStringEncoding] - options:options - error:&error]; - TORCH_CHECK(binaryLibrary, "Failed to create metal binary library, error: ", [[error description] UTF8String]); - return binaryLibrary; -} - -static id binaryPipelineState(id device, const std::string& kernel) { - static std::unordered_map> psoCache; - id pso = psoCache[kernel]; - if (pso) { - return pso; - } - - NSError* error = nil; - id binaryLib = compileBinaryOpsLibrary(device); - id binaryFunc = [binaryLib newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]]; - TORCH_CHECK(binaryFunc, "Failed to create function state object for: ", kernel); - pso = [device newComputePipelineStateWithFunction:binaryFunc error:&error]; - TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]); - - psoCache[kernel] = pso; - return pso; -} +)BINARY_METAL"); static void binary_mps_impl(TensorIteratorBase& iter, const std::string func_name) { TORCH_CHECK(iter.common_dtype() != at::kDouble, "float64 is not supported on MPS"); @@ -302,10 +269,10 @@ static void binary_mps_impl(TensorIteratorBase& iter, const std::string func_nam dispatch_sync_with_rethrow(mpsStream->queue(), ^() { @autoreleasepool { id computeEncoder = mpsStream->commandEncoder(); - const std::string kernel = func_name + "_" + scalarToMetalTypeString(input.scalar_type()); + const std::string kernel = func_name + "_" + scalarToMetalTypeString(input); auto kernelDataOffsets = generateKernelDataOffsets(computeEncoder, iter); - id binaryPSO = binaryPipelineState(device, kernel); + id binaryPSO = lib.getPipelineStateForFunc(kernel); // this function call is a no-op if MPS Profiler is not enabled getMPSProfiler().beginProfileKernel(binaryPSO, kernel, {input, other}); @@ -323,7 +290,7 @@ static void binary_mps_impl(TensorIteratorBase& iter, const std::string func_nam } void complex_mul_out(const Tensor& input, const Tensor& other, const Tensor& output) { - TORCH_INTERNAL_ASSERT(c10::isComplexType(input.scalar_type()) && c10::isComplexType(other.scalar_type())); + TORCH_INTERNAL_ASSERT(c10::isComplexType(input.scalar_type()) || c10::isComplexType(other.scalar_type())); auto new_size = at::infer_size(input.sizes(), other.sizes()); if (!output.sizes().equals(new_size)) { output.resize_(new_size); @@ -332,9 +299,10 @@ void complex_mul_out(const Tensor& input, const Tensor& other, const Tensor& out if (length == 0) { return; } + auto common_dtype = output.scalar_type(); auto output_as_real = at::view_as_real(output).select(output.dim(), 0); - auto input_as_real = at::view_as_real(input).select(input.dim(), 0); - auto other_as_real = at::view_as_real(other).select(other.dim(), 0); + auto input_as_real = at::view_as_real(input.to(kMPS, common_dtype)).select(input.dim(), 0); + auto other_as_real = at::view_as_real(other.to(kMPS, common_dtype)).select(other.dim(), 0); auto iter = 
TensorIteratorConfig().add_output(output_as_real).add_input(input_as_real).add_input(other_as_real).build(); diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm index 4c96954ef4aeb..a225ab83028d2 100644 --- a/aten/src/ATen/native/mps/operations/BinaryOps.mm +++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm @@ -53,6 +53,14 @@ #define BinaryOpFn(graph, primary, secondary) \ MPSGraphTensor*(mps::BinaryOpCachedGraph * graph, MPSGraphTensor * primary, MPSGraphTensor * secondary) +static inline Tensor legacy_complex_as_view(const Tensor& t) { + // Convert non-complex types (and cdouble CPU scalars) to cfloat + if (!isComplexType(t.scalar_type()) || t.scalar_type() == kComplexDouble) { + return at::view_as_real(t.to(kMPS, kComplexFloat)); + } + return at::view_as_real(t.dim() != 0 ? t : t.to(kMPS)); +} + // alpha is always 1.0 except when this function is called from add_sub_lerp_template() static void binaryOpTensor(const Tensor& self, const Tensor& other, @@ -69,7 +77,8 @@ static void binaryOpTensor(const Tensor& self, "MPS: ", op_name, " op with int64 input is supported natively starting from macOS 13.2"); - TORCH_CHECK_TYPE(!isComplexType(self.scalar_type()), "Complex types are unsupported on MPS"); + TORCH_CHECK_TYPE(!isComplexType(self.scalar_type()) || mps::supportsComplex(), + "Complex types are supported starting from MacOS 14.0+"); MPSStream* mpsStream = getCurrentMPSStream(); const bool is_self_scalar = self.dim() == 0; @@ -88,7 +97,7 @@ static void binaryOpTensor(const Tensor& self, Tensor output = output_; bool needsCopyToOutput = false; - if (!output_.is_contiguous() || (output_.is_view() && (self.is_alias_of(output_) || other.is_alias_of(output_)))) { + if (needsGather(output_) || (output_.is_view() && (self.is_alias_of(output_) || other.is_alias_of(output_)))) { output = at::empty(output_.sizes(), output_.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); needsCopyToOutput = true; } @@ -184,9 +193,7 @@ static void binaryOpTensor(const Tensor& self, } Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, needsCopyToOutput ? 
output : output_); - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results); + runMPSGraph(mpsStream, cachedGraph->graph(), feeds, outputPlaceholder); if (needsCopyToOutput) { output_.copy_(output); @@ -390,7 +397,7 @@ static void add_sub_lerp_template(const Tensor& self, CREATE_MPS_BINARY_COMPARISON_OP_FUNC(logical_xor_out_mps, logicalXOR, Tensor); TORCH_IMPL_FUNC(mul_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) { - if (c10::isComplexType(self.scalar_type()) || c10::isComplexType(other.scalar_type())) { + if (!mps::supportsComplex() && (c10::isComplexType(self.scalar_type()) || c10::isComplexType(other.scalar_type()))) { return mps::complex_mul_out(self, other, output); } mps::binaryOpTensor( @@ -420,19 +427,27 @@ static void add_sub_lerp_template(const Tensor& self, } TORCH_IMPL_FUNC(add_out_mps)(const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output) { - if (isComplexType(self.scalar_type()) && isComplexType(other.scalar_type()) && !alpha.isComplex()) { + if ((isComplexType(self.scalar_type()) || isComplexType(other.scalar_type())) && !alpha.isComplex() && + !mps::supportsComplex()) { // Complex add with non-complex alpha is just add over views - return mps::add_sub_lerp_template( - at::view_as_real(self), at::view_as_real(other), alpha, at::view_as_real(output), "add"); + return mps::add_sub_lerp_template(mps::legacy_complex_as_view(self), + mps::legacy_complex_as_view(other), + alpha, + mps::legacy_complex_as_view(output), + "add"); } mps::add_sub_lerp_template(self, other, alpha, output, "add"); } TORCH_IMPL_FUNC(sub_out_mps)(const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output) { - if (isComplexType(self.scalar_type()) && isComplexType(other.scalar_type()) && !alpha.isComplex()) { + if ((isComplexType(self.scalar_type()) || isComplexType(other.scalar_type())) && !alpha.isComplex() && + !mps::supportsComplex()) { // Complex sub with non-complex alpha is just add over views - return mps::add_sub_lerp_template( - at::view_as_real(self), at::view_as_real(other), alpha, at::view_as_real(output), "sub"); + return mps::add_sub_lerp_template(mps::legacy_complex_as_view(self), + mps::legacy_complex_as_view(other), + alpha, + mps::legacy_complex_as_view(output), + "sub"); } mps::add_sub_lerp_template(self, other, alpha, output, "sub"); } diff --git a/aten/src/ATen/native/mps/operations/BitwiseOps.mm b/aten/src/ATen/native/mps/operations/BitwiseOps.mm index 58a9e711b6322..f243b06ba5e9f 100644 --- a/aten/src/ATen/native/mps/operations/BitwiseOps.mm +++ b/aten/src/ATen/native/mps/operations/BitwiseOps.mm @@ -12,7 +12,7 @@ namespace at::native { namespace mps { -static const char* BITWISE_OPS_TEMPLATE = R"METAL( +static MetalShaderLibrary lib(R"METAL( kernel void bitwise_and_tensor(constant uint& length [[buffer(0)]], device {0} *out [[buffer(1)]], @@ -90,7 +90,8 @@ kernel void bitwise_not(constant uint& length [[buffer(0)]], }} out[offset] = ~a[offset]; }} -)METAL"; +)METAL", + 3); static const std::string& getMetalType(const c10::ScalarType& t) { // Mapping from c10::ScalarType to integral type that can be used for bitwise ops @@ -117,48 +118,12 @@ kernel void bitwise_not(constant uint& length [[buffer(0)]], return getMetalType(s.type()); } -static id compileBitwiseOpsLibrary(id device, - const std::string& t1, - const std::string& t2, - const std::string& t3) { - auto key = t1 + t2 + t3; - 
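An aside on the nextafter fallback added to BinaryKernel.mm above (the branch taken when __METAL_VERSION__ is below 310 and the builtin nextafter is unavailable): the kernel steps the float's bit pattern by one ULP toward the other operand. The same idea as a plain C++ sketch for float, for illustration only; this is not code from the patch.

#include <cmath>
#include <cstdint>
#include <cstring>

// Step `input` one representable float toward `other` by nudging its bit pattern.
static float nextafter_bits(float input, float other) {
  if (std::isnan(input) || std::isnan(other)) return NAN;
  if (input == other) return input;
  uint32_t bits;
  if (input == 0.0f) {
    bits = 1u;                      // smallest subnormal magnitude
    float tiny;
    std::memcpy(&tiny, &bits, sizeof(tiny));
    return other > 0.0f ? tiny : -tiny;
  }
  std::memcpy(&bits, &input, sizeof(bits));
  if ((input > 0.0f) == (input > other)) {
    --bits;   // moving toward zero: shrink the magnitude
  } else {
    ++bits;   // moving away from zero: grow the magnitude
  }
  std::memcpy(&input, &bits, sizeof(input));
  return input;
}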
static std::unordered_map> libMap; - auto it = libMap.find(key); - if (it != libMap.end()) { - return it->second; - } - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - auto rc = - [device newLibraryWithSource:[NSString stringWithUTF8String:fmt::format(BITWISE_OPS_TEMPLATE, t1, t2, t3).c_str()] - options:options - error:&error]; - TORCH_CHECK(rc != nil && error == nil, "Failed to compile library: ", [[error localizedDescription] UTF8String]); - libMap[key] = rc; - return rc; -} - -static id getCPLState(id device, - const std::string& t1, - const std::string& t2, - const std::string& t3, +template +static id getCPLState(const Tensor& t1, + const Tensor& t2, + const ScalarOrTensor& t3, const std::string& fname) { - auto key = t1 + t2 + t3 + fname; - static std::unordered_map> cplMap; - auto it = cplMap.find(key); - if (it != cplMap.end()) { - return it->second; - } - NSError* error = nil; - auto library = compileBitwiseOpsLibrary(device, t1, t2, t3); - id func = [library newFunctionWithName:[NSString stringWithUTF8String:fname.c_str()]]; - TORCH_CHECK(func != nil, "Can't get function ", fname); - auto rc = [device newComputePipelineStateWithFunction:func error:&error]; - TORCH_CHECK( - rc != nil && error == nil, "Failed to construct pipeline state: ", [[error localizedDescription] UTF8String]); - cplMap[key] = rc; - return rc; + return lib.getPipelineStateForFunc(fname, {getMetalType(t1), getMetalType(t2), getMetalType(t3)}); } static void handle_tensor_tensor_binary_op(const Tensor& self, @@ -167,8 +132,7 @@ static void handle_tensor_tensor_binary_op(const Tensor& self, const std::string& kernel_name) { using namespace at::mps; MPSStream* stream = getCurrentMPSStream(); - id cplState = getCPLState( - MPSDevice::getInstance()->device(), getMetalType(output), getMetalType(self), getMetalType(other), kernel_name); + auto cplState = getCPLState(output, self, other, kernel_name); uint32_t length = output.numel(); if (length == 0) { return; @@ -198,8 +162,7 @@ static void handle_tensor_scalar_binary_op(const Tensor& self, const std::string& kernel_name) { using namespace at::mps; MPSStream* stream = getCurrentMPSStream(); - id cplState = getCPLState( - MPSDevice::getInstance()->device(), getMetalType(output), getMetalType(self), getMetalType(other), kernel_name); + auto cplState = getCPLState(output, self, other, kernel_name); uint64_t sval = other.to(); uint32_t length = output.numel(); if (length == 0) { @@ -236,7 +199,7 @@ static void _bitwise_op_out_mps(const Tensor& self, auto output_size = at::infer_size_dimvector(self.sizes(), other.sizes()); resize_output(output, output_size); - if (!output.is_contiguous()) { + if (needsGather(output)) { output = output.contiguous(); needs_output_copy = true; } @@ -277,7 +240,7 @@ static void _bitwise_not_out_mps(const Tensor& self, const Tensor& output_) { bool needs_output_copy = false; resize_output(output, self.sizes()); - if (!output.is_contiguous()) { + if (needsGather(output)) { output = output.contiguous(); needs_output_copy = true; } @@ -296,8 +259,7 @@ static void _bitwise_not_out_mps(const Tensor& self, const Tensor& output_) { } using namespace at::mps; MPSStream* stream = getCurrentMPSStream(); - id cplState = getCPLState( - MPSDevice::getInstance()->device(), getMetalType(output), getMetalType(self), getMetalType(self), "bitwise_not"); + auto cplState = getCPLState(output, self, self, "bitwise_not"); dispatch_sync(stream->queue(), ^() { 
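The compileBitwiseOpsLibrary/getCPLState boilerplate removed around here (and its twins in BinaryKernel.mm, Bucketization.mm, CrossKernel.mm, Gamma.mm, HistogramKernel.mm below) all hand-rolled the same pattern that the shared MetalShaderLibrary helper now appears to centralize: compile the embedded Metal source once, then memoize one compute pipeline state per kernel name. A minimal sketch of that caching pattern, with hypothetical stand-ins for the Metal objects:

#include <string>
#include <unordered_map>

// Hypothetical stand-in for id<MTLComputePipelineState>.
struct PipelineState {
  std::string function_name;
};

class TinyShaderCache {
 public:
  explicit TinyShaderCache(std::string source) : source_(std::move(source)) {}

  // Compile-on-first-use, then reuse: the shape of getPipelineStateForFunc().
  const PipelineState& getPipelineStateForFunc(const std::string& fname) {
    auto it = cache_.find(fname);
    if (it == cache_.end()) {
      // In the real code this is where newLibraryWithSource: and
      // newComputePipelineStateWithFunction: would run.
      it = cache_.emplace(fname, PipelineState{fname}).first;
    }
    return it->second;
  }

 private:
  std::string source_;                                    // the R"(...)" Metal source
  std::unordered_map<std::string, PipelineState> cache_;  // one PSO per kernel name
};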
getMPSProfiler().beginProfileKernel(cplState, "bitwise_not", {self}); diff --git a/aten/src/ATen/native/mps/operations/Blas.mm b/aten/src/ATen/native/mps/operations/Blas.mm index 74cc252ddb3e9..1714a8e7e2f88 100644 --- a/aten/src/ATen/native/mps/operations/Blas.mm +++ b/aten/src/ATen/native/mps/operations/Blas.mm @@ -102,15 +102,8 @@ Tensor dot_mps(const Tensor& self, const Tensor& other) { Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, otherPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return output; @@ -188,10 +181,7 @@ Tensor dot_mps(const Tensor& self, const Tensor& other) { feeds[selfPlaceholder.getMPSGraphTensor()] = selfPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; diff --git a/aten/src/ATen/native/mps/operations/Bucketization.mm b/aten/src/ATen/native/mps/operations/Bucketization.mm index 52696dc019179..6f725e851af67 100644 --- a/aten/src/ATen/native/mps/operations/Bucketization.mm +++ b/aten/src/ATen/native/mps/operations/Bucketization.mm @@ -17,7 +17,7 @@ namespace at::native { namespace mps { -static const char* METAL_BUCKETIZATION = R"BUCKETIZE_METAL( +static MetalShaderLibrary lib(R"BUCKETIZE_METAL( #include using namespace metal; @@ -194,44 +194,7 @@ kernel void searchsorted( REGISTER_SEARCHSORTED_OP(long, int); REGISTER_SEARCHSORTED_OP(long, long); -)BUCKETIZE_METAL"; - -static id compileBucketizationOpsLibrary(id device) { - static id bucketizationLibrary = nil; - if (bucketizationLibrary) { - return bucketizationLibrary; - } - - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - bucketizationLibrary = [device newLibraryWithSource:[NSString stringWithCString:METAL_BUCKETIZATION - encoding:NSASCIIStringEncoding] - options:options - error:&error]; - TORCH_CHECK( - bucketizationLibrary, "Failed to create metal bucketization library, error: ", [[error description] UTF8String]); - return bucketizationLibrary; -} - -static id bucketizationPipelineState(id device, const std::string& kernel) { - static std::unordered_map> psoCache; - id pso = psoCache[kernel]; - if (pso) { - return pso; - } - - NSError* error = nil; - id bucketizationLib = compileBucketizationOpsLibrary(device); - id bucketizationFunc = - [bucketizationLib newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]]; - TORCH_CHECK(bucketizationFunc, "Failed to create function state object for: ", kernel); - pso = [device newComputePipelineStateWithFunction:bucketizationFunc error:&error]; - TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]); - - psoCache[kernel] = pso; - return pso; -} +)BUCKETIZE_METAL"); static void 
searchsorted_mps_contiguous(Tensor& result, const Tensor& input, @@ -250,15 +213,14 @@ static void searchsorted_mps_contiguous(Tensor& result, int64_t right_i64 = right; int64_t is_1d_boundaries = boundaries.dim() == 1; - id device = MPSDevice::getInstance()->device(); MPSStream* mpsStream = getCurrentMPSStream(); - dispatch_sync(mpsStream->queue(), ^() { + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { @autoreleasepool { id computeEncoder = mpsStream->commandEncoder(); - const std::string kernel = "searchsorted_" + scalarToMetalTypeString(input.scalar_type()) + "_" + - scalarToMetalTypeString(result.scalar_type()) + (sorter.defined() ? "_sorter" : ""); - id bucketizationPSO = mps::bucketizationPipelineState(device, kernel); + const std::string kernel = "searchsorted_" + scalarToMetalTypeString(input) + "_" + + scalarToMetalTypeString(result) + (sorter.defined() ? "_sorter" : ""); + id bucketizationPSO = lib.getPipelineStateForFunc(kernel); // this function call is a no-op if MPS Profiler is not enabled getMPSProfiler().beginProfileKernel(bucketizationPSO, kernel, {input, boundaries, sorter}); @@ -308,7 +270,7 @@ static void searchsorted_mps_contiguous(Tensor& result, return result; } - // for non-contiguous result tensors, we write the output to a contiguous copy so we can later copy back, maintaing + // for non-contiguous result tensors, we write the output to a contiguous copy so we can later copy back, maintaining // the original result tensor Tensor out = result.contiguous(); diff --git a/aten/src/ATen/native/mps/operations/ConstantOps.mm b/aten/src/ATen/native/mps/operations/ConstantOps.mm index 52c74c3637e1a..2e7d0881bb60f 100644 --- a/aten/src/ATen/native/mps/operations/ConstantOps.mm +++ b/aten/src/ATen/native/mps/operations/ConstantOps.mm @@ -21,13 +21,14 @@ } Tensor output = self; bool needsCopyToOutput = false; - if (!self.is_contiguous() || self.storage_offset()) { + if (needsGather(self)) { output = at::empty(self.sizes(), self.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); needsCopyToOutput = true; } struct CachedGraph : public MPSCachedGraph { CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; MPSGraphTensor* outputTensor_ = nil; }; @@ -35,36 +36,23 @@ string key = "fill_scalar_mps_impl" + getTensorsStringKey(self) + ":" + to_string(value.toDouble()); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - auto isBool = self.scalar_type() == c10::ScalarType::Bool; - auto isUInt8 = self.scalar_type() == c10::ScalarType::Byte; - auto dataType = !isUInt8 ? !isBool ? 
getMPSScalarType(self.scalar_type()) : MPSDataTypeInt8 : MPSDataTypeUInt32; - // constantWithScalar does not work for boolTypes on MacOS-12.[34] - // workaround by filing it as int8 tensor and than casting to bool - // See https://github.com/pytorch/pytorch/issues/82427 - // constantWithScalar does not work for UInt8 Types on MacOS-12.[34]/Ventura preview - // workaround by filing it as uint32 tensor and than casting to uint8 - // See https://github.com/pytorch/pytorch/issues/83692 - MPSGraphTensor* inputTensor = [mpsGraph constantWithScalar:value.toDouble() - shape:getMPSShape(self) - dataType:dataType]; + MPSGraphTensor* inputTensor = mpsGraphScalarPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type())); MPSGraphTensor* outputTensor = [mpsGraph identityWithTensor:inputTensor name:nil]; - if (isBool) { - outputTensor = [mpsGraph castTensor:outputTensor toType:MPSDataTypeBool name:@"constWithBool-workaround"]; - } - if (isUInt8) { - outputTensor = [mpsGraph castTensor:outputTensor toType:MPSDataTypeUInt8 name:@"constWithUInt8-workaround"]; - } - + newCachedGraph->inputTensor_ = inputTensor; newCachedGraph->outputTensor_ = outputTensor; }); + auto mpsScalar = getMPSScalar(value, self.scalar_type()); + auto mpsScalarData = getMPSGraphTensorFromScalar(getCurrentMPSStream(), mpsScalar); + NSDictionary* feeds = @{cachedGraph->inputTensor_ : mpsScalarData}; + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, needsCopyToOutput ? output : self, nullptr, !needsCopyToOutput); NSDictionary* results = @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), /*feeds*/ nil, results); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); if (needsCopyToOutput) { self.copy_(output); diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm index 4f262d1549fcb..fbf5a67262be2 100644 --- a/aten/src/ATen/native/mps/operations/Convolution.mm +++ b/aten/src/ATen/native/mps/operations/Convolution.mm @@ -318,10 +318,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t, feeds[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return *output; @@ -486,15 +483,8 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size, auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t); auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input); - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - weightsPlaceholder.getMPSGraphTensor() : weightsPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, weightsPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return *grad_input; } @@ -650,15 +640,8 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = 
Placeholder(cachedGraph->gradWeightTensor_, grad_weight_t); - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return grad_weight_t; diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm index 8b1dd402e4f34..572582f5cb947 100644 --- a/aten/src/ATen/native/mps/operations/Copy.mm +++ b/aten/src/ATen/native/mps/operations/Copy.mm @@ -2,9 +2,15 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #include #include +#include +#include +#include +#include +#include namespace at::native { namespace mps { @@ -41,14 +47,22 @@ static void copy_cast_mps(at::Tensor& dst, MPSShape* srcShape = getMPSShape(src); @autoreleasepool { - string key = "copy_cast_mps" + getTensorsStringKey({src, dst}); + const bool needs_conj = src.is_conj() != dst.is_conj(); + string key = "copy_cast_mps" + getTensorsStringKey({src, dst}, true, /*exclude_shape*/ true) + ":" + + std::to_string(needs_conj); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, src); - MPSGraphTensor* inputCastTensor = inputTensor; + MPSGraphTensor* inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, srcDType); + auto outputTensor = inputTensor; if (isFloatingType(src.scalar_type()) && dstDType == MPSDataTypeUInt8) { - inputCastTensor = [mpsGraph castTensor:inputTensor toType:MPSDataTypeInt32 name:@"cast"]; + outputTensor = [mpsGraph castTensor:inputTensor toType:MPSDataTypeInt32 name:@"cast"]; + } + if (srcDType != dstDType) { + outputTensor = [mpsGraph castTensor:outputTensor toType:dstDType name:@"cast"]; + } + if (needs_conj) { + TORCH_CHECK(supportsComplex(), "MPS complex tensors conjugation needs MacOS14+"); + outputTensor = [mpsGraph conjugateWithTensor:outputTensor name:nil]; } - MPSGraphTensor* outputTensor = [mpsGraph castTensor:inputCastTensor toType:dstDType name:@"cast"]; newCachedGraph->inputTensor_ = inputTensor; newCachedGraph->outputTensor_ = outputTensor; @@ -72,12 +86,11 @@ static void copy_cast_mps(at::Tensor& dst, id device = MPSDevice::getInstance()->device(); MPSStream* stream = getCurrentMPSStream(); - Tensor dst; - Tensor src; + Tensor dst = dst_; + Tensor src = src_; + if (!dst_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) { dst = at::empty_like(dst_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } else { - dst = dst_; } auto storage_byte_offset = src_.storage_offset() * src_.itemsize(); @@ -90,9 +103,8 @@ static void copy_cast_mps(at::Tensor& dst, src = src_.expand_as(dst).contiguous(); storage_byte_offset = src.storage_offset() * src.itemsize(); } - } else { - src = src_; } + id sourceBuffer = getMTLBufferStorage(src); size_t dst_tensor_nbytes = dst.nbytes(); @@ -110,28 +122,25 @@ static void copy_cast_mps(at::Tensor& dst, length:alignedLength options:options deallocator:nil]; - id tmpBuffer = sourceBuffer; - Tensor tmp; + id maybeCastedSourceBuffer = sourceBuffer; + Tensor maybeCastedSource; bool needsBlit = true; if (src_.dtype() != dst.dtype()) { 
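Stepping back to the recurring replacement of bare !is_contiguous() checks with needsGather() in the hunks above (BinaryOps.mm, BitwiseOps.mm, ConstantOps.mm, and Gamma.mm below): the ConstantOps.mm change, where "!self.is_contiguous() || self.storage_offset()" becomes "needsGather(self)", suggests the helper also folds in the storage-offset case. A guessed restatement of the predicate, not the helper's actual definition:

#include <ATen/core/Tensor.h>

// Assumption, inferred from the ConstantOps.mm hunk: a tensor needs a gather
// into a contiguous temporary when it is non-contiguous or does not start at
// the beginning of its storage.
static bool needs_gather_like(const at::Tensor& t) {
  return !t.is_contiguous() || t.storage_offset() != 0;
}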
if (destOffset == 0 && storage_byte_offset == 0) { // Return the casted tensor directly if there's no destination offset needsBlit = false; - tmpBuffer = destBuffer; + maybeCastedSourceBuffer = destBuffer; } else if (src.element_size() < dst.element_size()) { - tmp = at::empty(dst.sizes(), dst.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); - tmpBuffer = getMTLBufferStorage(tmp); + maybeCastedSource = at::empty(dst.sizes(), dst.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); + maybeCastedSourceBuffer = getMTLBufferStorage(maybeCastedSource); } - } - size_t size_to_copy = src.nbytes(); - // In case of dtype change, first convert src inplace - if (src_.dtype() != dst.dtype()) { - copy_cast_mps(dst, src, tmpBuffer, sourceBuffer, non_blocking); + // In case of dtype change, first convert src inplace + copy_cast_mps(dst, src, maybeCastedSourceBuffer, sourceBuffer, non_blocking); } if (needsBlit) { - size_to_copy = (size_to_copy / src.element_size()) * dst.element_size(); + const size_t size_to_copy = (src.nbytes() / src.element_size()) * dst.element_size(); // If there's anything wrong with source, we shouldn't return dst_ silently and must error out. TORCH_INTERNAL_ASSERT(sourceBuffer && dst_tensor_nbytes > 0); @@ -139,7 +148,7 @@ static void copy_cast_mps(at::Tensor& dst, getMPSProfiler().beginProfileCopy(sourceBuffer, destBuffer, src, dst, size_to_copy, non_blocking); stream->copy_and_sync( - tmpBuffer, destBuffer, size_to_copy, storage_byte_offset, destOffset, non_blocking, profile_id); + maybeCastedSourceBuffer, destBuffer, size_to_copy, storage_byte_offset, destOffset, non_blocking, profile_id); } [destBuffer release]; } @@ -227,7 +236,7 @@ void copy_blit_mps(void* dst, const void* src, size_t size) { Tensor src; auto sameMemFormat = src_.is_contiguous(dst_.suggest_memory_format()) && dst_.is_contiguous(dst_.suggest_memory_format()); - const bool sameDataType = src_.dtype() == dst_.dtype(); + const bool sameDataType = src_.dtype() == dst_.dtype() && src_.is_conj() == dst_.is_conj(); if ((!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) || // the copy_cast path requires storage_offset to be applied before casting @@ -266,13 +275,32 @@ void copy_blit_mps(void* dst, const void* src, size_t size) { // for GPU to GPU copies we only encode to stream's command buffer (no flushing) stream->copy(sourceBuffer, destBuffer, src.nbytes(), src_byte_offset, dst_byte_offset, profile_id); } else { - if (dst_byte_offset) { - auto tmp = at::empty(dst_.sizes(), dst_.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); - auto tmpBuffer = getMTLBufferStorage(tmp); - copy_cast_mps(tmp, src, tmpBuffer, sourceBuffer); - - uint64_t profile_id = getMPSProfiler().beginProfileCopy(tmpBuffer, destBuffer, tmp, dst_, dst_.nbytes(), true); - stream->copy(tmpBuffer, destBuffer, dst_.nbytes(), 0, dst_byte_offset, profile_id); + // Simulate cast to Complex on older MacOS by initializing real and imag parts + if (dst_.is_complex() && !supportsComplex()) { + if (!src.is_complex()) { + at::real(dst_).copy_(src); + at::imag(dst_).fill_(0); + } else if (src.is_conj() || dst_.is_conj()) { + // One cannot take view of conjugated tensor, but for some reason real and imag views are fine + // Use this to implement a conjugation + at::real(dst_).copy_(at::real(src)); + if (src.is_conj() != dst_.is_conj()) { + at::imag(dst_).copy_(at::neg(at::imag(src))); + } else { + at::imag(dst_).copy_(at::imag(src)); + } + } else { + at::view_as_real(dst_).copy_(at::view_as_real(src)); + } 
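The branch added just above emulates complex copies (and conjugation) on macOS versions without native MPS complex support by copying the real and imaginary planes separately, negating the imaginary plane when exactly one side is conjugated. The same decomposition in plain C++, as an illustration only:

#include <complex>
#include <cstddef>
#include <vector>

// dst[i] = src[i], optionally conjugated: real parts copy through unchanged,
// the imaginary part is negated when conj flags mismatch between src and dst.
static void copy_complex_via_parts(const std::vector<std::complex<float>>& src,
                                   std::vector<std::complex<float>>& dst,
                                   bool conj_mismatch) {
  dst.resize(src.size());
  for (std::size_t i = 0; i < src.size(); ++i) {
    const float re = src[i].real();
    const float im = src[i].imag();
    dst[i] = {re, conj_mismatch ? -im : im};
  }
}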
+ } else if (dst_byte_offset) { + auto maybeCastedSource = + at::empty(dst_.sizes(), dst_.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); + auto maybeCastedSourceBuffer = getMTLBufferStorage(maybeCastedSource); + copy_cast_mps(maybeCastedSource, src, maybeCastedSourceBuffer, sourceBuffer); + + uint64_t profile_id = getMPSProfiler().beginProfileCopy( + maybeCastedSourceBuffer, destBuffer, maybeCastedSource, dst_, dst_.nbytes(), true); + stream->copy(maybeCastedSourceBuffer, destBuffer, dst_.nbytes(), 0, dst_byte_offset, profile_id); } else { copy_cast_mps(dst_, src, destBuffer, sourceBuffer); } diff --git a/aten/src/ATen/native/mps/operations/CrossKernel.mm b/aten/src/ATen/native/mps/operations/CrossKernel.mm index 1e04a7633f1aa..69de4c5e78cc1 100644 --- a/aten/src/ATen/native/mps/operations/CrossKernel.mm +++ b/aten/src/ATen/native/mps/operations/CrossKernel.mm @@ -8,7 +8,10 @@ namespace at::native { namespace { -static const char* METAL_CROSS = R"CROSS_METAL( +using namespace mps; + +static MetalShaderLibrary lib(R"CROSS_METAL( +#include #include using namespace metal; @@ -75,44 +78,7 @@ kernel void cross(constant void * input_ [[buffer(0)]], REGISTER_CROSS_OP(uchar); REGISTER_CROSS_OP(bool); -)CROSS_METAL"; - -using namespace mps; - -static id compileCrossOpLibrary(id device) { - static id crossLibrary = nil; - if (crossLibrary) { - return crossLibrary; - } - - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - crossLibrary = [device newLibraryWithSource:[NSString stringWithCString:METAL_CROSS encoding:NSASCIIStringEncoding] - options:options - error:&error]; - TORCH_CHECK(crossLibrary, "Failed to create metal cross library, error: ", [[error description] UTF8String]); - return crossLibrary; -} - -static id crossPipelineState(id device, ScalarType scalar_type) { - std::string kernel = "cross_" + scalarToMetalTypeString(scalar_type); - static std::unordered_map> psoCache; - id pso = psoCache[kernel]; - if (pso) { - return pso; - } - - NSError* error = nil; - id crossLib = compileCrossOpLibrary(device); - id crossFunc = [crossLib newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]]; - TORCH_CHECK(crossFunc, "Failed to create function state object for: ", kernel); - pso = [device newComputePipelineStateWithFunction:crossFunc error:&error]; - TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]); - - psoCache[kernel] = pso; - return pso; -} +)CROSS_METAL"); void cross_mps_impl(const Tensor& out, const Tensor& input, const Tensor& other, int64_t dim) { TORCH_CHECK(input.dtype() != at::kDouble, "float64 is not supported on MPS"); @@ -138,7 +104,7 @@ void cross_mps_impl(const Tensor& out, const Tensor& input, const Tensor& other, id computeEncoder = mpsStream->commandEncoder(); auto kernelDataOffsets = generateKernelDataOffsets(computeEncoder, iter); - id crossPSO = crossPipelineState(device, out.scalar_type()); + auto crossPSO = lib.getPipelineStateForFunc("cross_" + scalarToMetalTypeString(out)); // this function call is a no-op if MPS Profiler is not enabled getMPSProfiler().beginProfileKernel(crossPSO, "cross", {input, other}); diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm index f8bb70086d5ff..7ed06c8bf4373 100644 --- a/aten/src/ATen/native/mps/operations/Distributions.mm +++ b/aten/src/ATen/native/mps/operations/Distributions.mm @@ -133,11 
+133,7 @@ } Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self); - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - }; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return self; @@ -235,7 +231,7 @@ scalar_type = ScalarType::Float; else if (scalar_type == ScalarType::ComplexHalf) scalar_type = ScalarType::Half; - AT_DISPATCH_FLOATING_TYPES_AND_HALF(scalar_type, "check_uniform_bounds", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, scalar_type, "check_uniform_bounds", [&] { const auto min = static_cast(std::numeric_limits::lowest()); const auto max = static_cast(std::numeric_limits::max()); TORCH_CHECK(from <= to, "uniform_ expects to return a [from, to) range, but found from=", from, " > to=", to); @@ -325,17 +321,15 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional= to=", to); if (isFloatingType(input_dtype)) { - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::Half, at::ScalarType::BFloat16, input_dtype, "random_update_from_to", [&] { - from = templates::update_from(from); - to = templates::update_to(to); - TORCH_CHECK( - from < to, - "random_mps_ expects 'from' casted to dtype to be less than 'to' casted to dtype, but got from=", - from, - " >= to=", - to); - }); + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input_dtype, "random_update_from_to", [&] { + from = templates::update_from(from); + to = templates::update_to(to); + TORCH_CHECK(from < to, + "random_mps_ expects 'from' casted to dtype to be less than 'to' casted to dtype, but got from=", + from, + " >= to=", + to); + }); templates::check_from_to_in_range(from, to - 1, self.dtype()); } } else if (from != std::numeric_limits::lowest()) { @@ -575,10 +569,7 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optionalstateTensor : stateTensorData, probPlaceholder.getMPSGraphTensor() : probPlaceholder.getMPSGraphTensorData() }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; diff --git a/aten/src/ATen/native/mps/operations/Eye.mm b/aten/src/ATen/native/mps/operations/Eye.mm index bdbb361a8a1e9..c7d682c3f22e0 100644 --- a/aten/src/ATen/native/mps/operations/Eye.mm +++ b/aten/src/ATen/native/mps/operations/Eye.mm @@ -98,9 +98,7 @@ // Create dictionary of inputs/feeds and outputs/results // In this case, there are no inputs, so the feeds are nil NSDictionary* feeds = nil; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; + auto results = dictionaryFromPlaceholders(outputPlaceholder); // Run the graph runMPSGraph(stream, cachedGraph->graph(), feeds, results); diff --git a/aten/src/ATen/native/mps/operations/FastFourierTransform.mm b/aten/src/ATen/native/mps/operations/FastFourierTransform.mm new file mode 100644 index 0000000000000..21fb75bb2179e --- /dev/null +++ b/aten/src/ATen/native/mps/operations/FastFourierTransform.mm @@ -0,0 +1,179 @@ +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + +#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0)) +@implementation 
FakeMPSGraphFFTDescriptor ++ (nullable instancetype)descriptor { + // Redispatch the constructor to the actual implementation + id desc = NSClassFromString(@"MPSGraphFFTDescriptor"); + return (FakeMPSGraphFFTDescriptor*)[desc descriptor]; +} + +- (nonnull id)copyWithZone:(nullable NSZone*)zone { + return self; +} +@end +#endif + +namespace at::native { +namespace { +MPSGraphFFTScalingMode normalization_to_ScalingMode(int64_t normalization) { + switch (static_cast(normalization)) { + case fft_norm_mode::none: + return MPSGraphFFTScalingModeNone; + case fft_norm_mode::by_n: + return MPSGraphFFTScalingModeSize; + case fft_norm_mode::by_root_n: + return MPSGraphFFTScalingModeUnitary; + default: + break; + } + TORCH_CHECK(false, "Unsupported normalization type", normalization); +} + +NSArray* IntArrayToNSArray(IntArrayRef arr) { + auto rc = [NSMutableArray arrayWithCapacity:arr.size()]; + for (const auto idx : c10::irange(arr.size())) { + rc[idx] = [NSNumber numberWithInteger:arr[idx]]; + } + return rc; +} + +} // anonymous namespace + +Tensor _fft_c2r_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t last_dim_size) { + TORCH_CHECK(self.is_complex()); + auto in_sizes = self.sizes(); + DimVector out_sizes(in_sizes.begin(), in_sizes.end()); + out_sizes[dim.back()] = last_dim_size; + auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type()))); + return _fft_c2r_mps_out(self, dim, normalization, last_dim_size, out); +} + +Tensor _fft_r2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) { + TORCH_CHECK(self.is_floating_point()); + auto input_sizes = self.sizes(); + DimVector out_sizes(input_sizes.begin(), input_sizes.end()); + auto last_dim = dim.back(); + auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1; + if (onesided) { + out_sizes[last_dim] = last_dim_halfsize; + } + + auto out = at::empty(out_sizes, self.options().dtype(c10::toComplexType(self.scalar_type()))); + return _fft_r2c_mps_out(self, dim, normalization, onesided, out); +} + +Tensor _fft_c2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { + TORCH_CHECK(self.is_complex()); + if (dim.empty()) { + return self.clone(); + } + auto out = at::empty(self.sizes(), self.options()); + return _fft_c2c_mps_out(self, dim, normalization, forward, out); +} + +using namespace mps; + +// TODO: Investigate numerical discrepancies see https://github.com/pytorch/pytorch/issues/120237 +Tensor& _fft_r2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided, Tensor& out) { + TORCH_CHECK(supportsComplex(), "FFT operations are only supported on MacOS 14+"); + auto key = __func__ + getTensorsStringKey({self, out}) + ":" + getArrayRefString(dim) + ":" + + std::to_string(normalization) + ":" + std::to_string(onesided); + @autoreleasepool { + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + auto descriptor = [MPSGraphFFTDescriptor descriptor]; + descriptor.scalingMode = normalization_to_ScalingMode(normalization); + MPSGraphTensor* outputTensor; + if (onesided) { + // Return only unique results: + outputTensor = [mpsGraph realToHermiteanFFTWithTensor:inputTensor + axes:IntArrayToNSArray(dim) + descriptor:descriptor + name:nil]; + } else { + // Return with Hermitean conjugate results: + auto useDataType = + (inputTensor.dataType == MPSDataTypeFloat16) ? 
MPSDataTypeComplexFloat16 : MPSDataTypeComplexFloat32; + auto cTensor = [mpsGraph castTensor:inputTensor toType:useDataType name:nil]; + outputTensor = [mpsGraph fastFourierTransformWithTensor:cTensor + axes:IntArrayToNSArray(dim) + descriptor:descriptor + name:nil]; + } + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + }); + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); + } + return out; +} + +Tensor& _fft_c2r_mps_out(const Tensor& self, + IntArrayRef dim, + int64_t normalization, + int64_t last_dim_size, + Tensor& out) { + TORCH_CHECK(supportsComplex(), "FFT operations are only supported on MacOS 14+"); + auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" + + std::to_string(normalization) + ":" + std::to_string(last_dim_size); + @autoreleasepool { + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + auto descriptor = [MPSGraphFFTDescriptor descriptor]; + descriptor.scalingMode = normalization_to_ScalingMode(normalization); + auto outputTensor = [mpsGraph HermiteanToRealFFTWithTensor:inputTensor + axes:IntArrayToNSArray(dim) + descriptor:descriptor + name:nil]; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + }); + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); + } + return out; +} + +Tensor& _fft_c2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward, Tensor& out) { + TORCH_CHECK(supportsComplex(), "FFT operations are only supported on MacOS 14+"); + auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" + + std::to_string(normalization) + ":" + std::to_string(forward); + @autoreleasepool { + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + auto descriptor = [MPSGraphFFTDescriptor descriptor]; + descriptor.scalingMode = normalization_to_ScalingMode(normalization); + descriptor.inverse = !forward; + auto outputTensor = [mpsGraph fastFourierTransformWithTensor:inputTensor + axes:IntArrayToNSArray(dim) + descriptor:descriptor + name:nil]; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + }); + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); + } + return out; +} + +} // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/Gamma.mm b/aten/src/ATen/native/mps/operations/Gamma.mm index 1a6bbb25c05f2..826e7acdde358 100644 --- a/aten/src/ATen/native/mps/operations/Gamma.mm +++ b/aten/src/ATen/native/mps/operations/Gamma.mm @@ -24,7 +24,7 @@ * See note [3-Clause BSD License for the Cephes Math Library]. 
*/ -static const char* GAMMA_OPS_TEMPLATE = R"METAL( +static MetalShaderLibrary lib(R"METAL( #include using namespace metal; @@ -388,45 +388,11 @@ kernel void polygamma(device {0} *input [[buffer(0)]], output[id] = sgn * Gamma(n + 1) * calc_zeta(n + 1, x); }} -)METAL"; +)METAL", + 2); -static id compileGammaOpsLibrary(id device, const std::string& t1, const std::string& t2) { - auto key = t1 + t2; - static std::unordered_map> libMap; - auto it = libMap.find(key); - if (it != libMap.end()) { - return it->second; - } - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - auto rc = [device newLibraryWithSource:[NSString stringWithUTF8String:fmt::format(GAMMA_OPS_TEMPLATE, t1, t2).c_str()] - options:options - error:&error]; - TORCH_CHECK(rc != nil && error == nil, "Failed to compile library: ", [[error localizedDescription] UTF8String]); - libMap[key] = rc; - return rc; -} - -static id getCPLState(id device, - const std::string& t1, - const std::string& t2, - const std::string& fname) { - auto key = t1 + t2 + fname; - static std::unordered_map> cplMap; - auto it = cplMap.find(key); - if (it != cplMap.end()) { - return it->second; - } - NSError* error = nil; - auto library = compileGammaOpsLibrary(device, t1, t2); - id func = [library newFunctionWithName:[NSString stringWithUTF8String:fname.c_str()]]; - TORCH_CHECK(func != nil, "Can't get function ", fname); - auto rc = [device newComputePipelineStateWithFunction:func error:&error]; - TORCH_CHECK( - rc != nil && error == nil, "Failed to construct pipeline state: ", [[error localizedDescription] UTF8String]); - cplMap[key] = rc; - return rc; +static id getCPLState(const Tensor& t1, const Tensor& t2, const std::string& fname) { + return lib.getPipelineStateForFunc(fname, {scalarToMetalTypeString(t1), scalarToMetalTypeString(t2)}); } } // namespace mps @@ -441,19 +407,15 @@ kernel void polygamma(device {0} *input [[buffer(0)]], return; } - if (!self.is_contiguous()) { + if (mps::needsGather(output_)) { output = output.contiguous(); needs_output_copy = true; } using namespace mps; - std::string input_type = scalarToMetalTypeString(self.scalar_type()); - std::string output_type = scalarToMetalTypeString(output.scalar_type()); - @autoreleasepool { - id device = MPSDevice::getInstance()->device(); - id cplState = getCPLState(device, input_type, output_type, "lgamma"); + id cplState = getCPLState(self, output, "lgamma"); MPSStream* mpsStream = getCurrentMPSStream(); dispatch_sync(mpsStream->queue(), ^() { @@ -485,19 +447,15 @@ kernel void polygamma(device {0} *input [[buffer(0)]], return; } - if (!self.is_contiguous()) { + if (mps::needsGather(output_)) { output = output.contiguous(); needs_output_copy = true; } using namespace mps; - std::string input_type = scalarToMetalTypeString(self.scalar_type()); - std::string output_type = scalarToMetalTypeString(output.scalar_type()); - @autoreleasepool { - id device = MPSDevice::getInstance()->device(); - id cplState = getCPLState(device, input_type, output_type, "digamma"); + id cplState = getCPLState(self, output, "digamma"); MPSStream* mpsStream = getCurrentMPSStream(); dispatch_sync(mpsStream->queue(), ^() { @@ -530,15 +488,13 @@ kernel void polygamma(device {0} *input [[buffer(0)]], return; } - if (!self.is_contiguous()) { + if (mps::needsGather(output_)) { output = output.contiguous(); needs_output_copy = true; } using namespace mps; - std::string input_type = scalarToMetalTypeString(self.scalar_type()); - 
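Stepping back to the new FastFourierTransform.mm earlier in this diff: normalization_to_ScalingMode maps the three PyTorch normalization modes onto MPSGraph's scaling modes (none to None, by_n to Size, by_root_n to Unitary). My reading of the scale each mode implies for a length-n transform, written as an illustrative helper; the enum below is hypothetical and not the patch's fft_norm_mode:

#include <cmath>
#include <cstdint>

enum class ScalingMode { None, Size, Unitary };  // hypothetical mirror of the MPSGraph modes

static double applied_scale(ScalingMode m, int64_t n) {
  switch (m) {
    case ScalingMode::None:    return 1.0;                          // no normalization
    case ScalingMode::Size:    return 1.0 / double(n);              // "by n"
    case ScalingMode::Unitary: return 1.0 / std::sqrt(double(n));   // "by sqrt(n)"
  }
  return 1.0;
}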
std::string output_type = scalarToMetalTypeString(output.scalar_type()); std::string func_name; if (order == 0) { @@ -550,9 +506,7 @@ kernel void polygamma(device {0} *input [[buffer(0)]], } @autoreleasepool { - id device = MPSDevice::getInstance()->device(); - - id cplState = getCPLState(device, input_type, output_type, func_name); + id cplState = getCPLState(self, output, func_name); MPSStream* mpsStream = getCurrentMPSStream(); dispatch_sync(mpsStream->queue(), ^() { diff --git a/aten/src/ATen/native/mps/operations/GridSampler.mm b/aten/src/ATen/native/mps/operations/GridSampler.mm index fc775333b6c71..8589ed28dc54e 100644 --- a/aten/src/ATen/native/mps/operations/GridSampler.mm +++ b/aten/src/ATen/native/mps/operations/GridSampler.mm @@ -116,14 +116,8 @@ static void grid_sampler_2d_mps_impl(Tensor& output, Placeholder gridPlaceholder = Placeholder(cachedGraph->gridTensor_, grid); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - gridPlaceholder.getMPSGraphTensor() : gridPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, gridPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } } // namespace mps diff --git a/aten/src/ATen/native/mps/operations/HistogramKernel.mm b/aten/src/ATen/native/mps/operations/HistogramKernel.mm index 874e0173658d3..553d792c94708 100644 --- a/aten/src/ATen/native/mps/operations/HistogramKernel.mm +++ b/aten/src/ATen/native/mps/operations/HistogramKernel.mm @@ -21,7 +21,7 @@ BINARY_SEARCH, }; -static const char* METAL_HISTOGRAM = R"HISTOGRAM_METAL( +static MetalShaderLibrary lib(R"HISTOGRAM_METAL( #include using namespace metal; @@ -157,42 +157,7 @@ kernel void kernel_index_offset(constant uint * strides [[buffer data_offsets[thread_index] += remainder * strides[reversed_dim]; } } -)HISTOGRAM_METAL"; - -static id compileHistogramOpLibrary(id device) { - static id histogramLibrary = nil; - if (histogramLibrary) { - return histogramLibrary; - } - - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - histogramLibrary = [device newLibraryWithSource:[NSString stringWithCString:METAL_HISTOGRAM - encoding:NSASCIIStringEncoding] - options:options - error:&error]; - TORCH_CHECK(histogramLibrary, "Failed to create metal histogram library, error: ", [[error description] UTF8String]); - return histogramLibrary; -} - -static id histogramPipelineState(id device, const std::string& kernel) { - static std::unordered_map> psoCache; - id pso = psoCache[kernel]; - if (pso) { - return pso; - } - - NSError* error = nil; - id crossLib = compileHistogramOpLibrary(device); - id crossFunc = [crossLib newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]]; - TORCH_CHECK(crossFunc, "Failed to create function state object for: ", kernel); - pso = [device newComputePipelineStateWithFunction:crossFunc error:&error]; - TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]); - - psoCache[kernel] = pso; - return pso; -} +)HISTOGRAM_METAL"); template void histogramdd_kernel_impl(Tensor& hist_output, @@ -279,7 +244,7 @@ void 
histogramdd_kernel_impl(Tensor& hist_output, id stridedIndicesBuffer = [[device newBufferWithLength:stridedIndicesNumThreads * sizeof(uint) options:0] autorelease]; - id stridedIndicesPSO = histogramPipelineState(device, "kernel_index_offset"); + id stridedIndicesPSO = lib.getPipelineStateForFunc("kernel_index_offset"); [computeEncoder setComputePipelineState:stridedIndicesPSO]; [computeEncoder setBytes:strides.data() length:sizeof(uint32_t) * nDim atIndex:0]; @@ -289,8 +254,8 @@ void histogramdd_kernel_impl(Tensor& hist_output, mtl_dispatch1DJob(computeEncoder, stridedIndicesPSO, stridedIndicesNumThreads); - const std::string kernel = "histogramdd_" + scalarToMetalTypeString(input.scalar_type()); - id histogramPSO = histogramPipelineState(device, kernel); + const std::string kernel = "histogramdd_" + scalarToMetalTypeString(input); + id histogramPSO = lib.getPipelineStateForFunc(kernel); // this function call is a no-op if MPS Profiler is not enabled getMPSProfiler().beginProfileKernel(histogramPSO, "histogram", allTensorsList); diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm index 01113f699a6b6..55ead2cba1bd8 100644 --- a/aten/src/ATen/native/mps/operations/Indexing.mm +++ b/aten/src/ATen/native/mps/operations/Indexing.mm @@ -39,10 +39,7 @@ #include #include #include -#endif - -#ifdef __OBJC__ -#include +#include #endif namespace at::native { @@ -196,9 +193,8 @@ static void validateInputData(const TensorIteratorBase& iter, TORCH_CHECK(scalar_type == ScalarType::Float || inputTensor.scalar_type() == ScalarType::Int || scalar_type == ScalarType::Bool); } else { - TORCH_CHECK(c10::isIntegralType(scalar_type, /*includesBool=*/true) || scalar_type == ScalarType::Float || - scalar_type == ScalarType::Half || scalar_type == ScalarType::ComplexFloat || - scalar_type == ScalarType::ComplexHalf, + TORCH_CHECK(c10::isIntegralType(scalar_type, /*includesBool=*/true) || supportedFloatingType(scalar_type) || + scalar_type == ScalarType::ComplexFloat || scalar_type == ScalarType::ComplexHalf, getMPSTypeString(inputTensor) + std::string(" not supported for index.Tensor_out")); } } @@ -245,14 +241,20 @@ static void index_put_kernel_mps(TensorIterator& iter, } // namespace mps static Tensor nonzero_fallback(const Tensor& self) { - TORCH_WARN_ONCE("MPS: nonzero op is supported natively starting from macOS 13.0. ", - "Falling back on CPU. This may have performance implications."); - return at::nonzero(self.to("cpu")).clone().to("mps"); } Tensor& nonzero_out_mps(const Tensor& self, Tensor& out_) { - if (!is_macos_13_or_newer()) { + if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { + TORCH_WARN_ONCE("MPS: nonzero op is supported natively starting from macOS 13.0. ", + "Falling back on CPU. This may have performance implications."); + Tensor out_fallback = nonzero_fallback(self); + at::native::resize_output(out_, out_fallback.sizes()); + out_.copy_(out_fallback.to("mps")); + return out_; + } else if (self.is_complex()) { + TORCH_WARN_ONCE("MPS: nonzero op is not supported for complex datatypes. ", + "Falling back on CPU. 
This may have performance implications."); Tensor out_fallback = nonzero_fallback(self); at::native::resize_output(out_, out_fallback.sizes()); out_.copy_(out_fallback.to("mps")); @@ -270,7 +272,7 @@ static Tensor nonzero_fallback(const Tensor& self) { TORCH_CHECK(self.numel() < std::numeric_limits::max(), "nonzero is not supported for tensors with more than INT_MAX elements, \ - file a support request"); + See https://github.com/pytorch/pytorch/issues/51871"); TORCH_CHECK( out_.dtype() == at::kLong, "Expected object of scalar type ", at::kLong, " as out, but got ", out_.dtype()); TORCH_CHECK(self.device() == out_.device(), @@ -286,7 +288,6 @@ static Tensor nonzero_fallback(const Tensor& self) { CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} MPSGraphTensor* inputTensor_ = nil; MPSGraphTensor* outputTensor_ = nil; - MPSGraphTensor* scatterDataTensor_ = nil; }; dispatch_sync(stream->queue(), ^() { @@ -298,109 +299,27 @@ static Tensor nonzero_fallback(const Tensor& self) { return out_; } - bool contiguous_output = out_.is_contiguous(); + bool contiguous_output = !needsGather(out_); Tensor out = out_; if (!contiguous_output) { out = at::empty(out_.sizes(), out_.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); } - int64_t _apparentInputShape = 1; - for (auto dim : self.sizes()) { - _apparentInputShape *= dim; - } - MPSShape* apparentOutputShape = @[ @(total_nonzero * nDim) ]; - MPSShape* apparentInputShape = @[ @(_apparentInputShape) ]; - - // Pseudocode: - // - // inputTensor = [1, 0, 0, 3] - // inputNonZero = [1, 0, 0, 1] - // indices = [1, 1, 1, 2] - // maskedIndices = [0, -1, -1, 1] - // coordinates = [0, 1, 2, 3] - // scatterResult = [0, 3] - @autoreleasepool { string key = "nonzero_out_mps" + getTensorsStringKey(self); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSDataType inputDataType = getMPSDataType(self); - MPSShape* inputShape = getMPSShape(self); - - MPSGraphTensor* inputTensor = - mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(self.scalar_type()), apparentInputShape); - MPSGraphTensor* scatterDataTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSScalarType(out.scalar_type())); - MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0 dataType:inputDataType]; - MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1.0 dataType:MPSDataTypeInt32]; - MPSGraphTensor* minusMaxDimTensor = [mpsGraph constantWithScalar:-maxDimensions dataType:MPSDataTypeInt32]; - MPSGraphTensor* inputNotEqualToZeroTensor = [mpsGraph notEqualWithPrimaryTensor:inputTensor - secondaryTensor:zeroTensor - name:nil]; - MPSGraphTensor* maskTensor = [mpsGraph castTensor:inputNotEqualToZeroTensor - toType:MPSDataTypeInt32 - name:@"castToInt32"]; - MPSGraphTensor* indicesTensor = [mpsGraph cumulativeSumWithTensor:maskTensor axis:0 name:nil]; - MPSGraphTensor* indicesMinusOneTensor = [mpsGraph subtractionWithPrimaryTensor:indicesTensor - secondaryTensor:oneTensor - name:nil]; - MPSGraphTensor* maskedIndicesTensor = [mpsGraph selectWithPredicateTensor:inputNotEqualToZeroTensor - truePredicateTensor:indicesMinusOneTensor - falsePredicateTensor:minusMaxDimTensor - name:nil]; - MPSGraphTensor* coordinatesTensor = [mpsGraph reshapeTensor:[mpsGraph coordinateAlongAxis:0 - withShape:inputShape - name:nil] - withShape:@[ @-1 ] - name:nil]; - if (nDim > 1) { - NSMutableArray* maskedIndicesTensorArray = [NSMutableArray arrayWithCapacity:nDim]; - NSMutableArray* coordinatesTensorArray = [NSMutableArray arrayWithCapacity:nDim]; - - 
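For context on the scatter-based nonzero graph being deleted in this hunk (replaced by a single nonZeroIndicesOfTensor: call): per its own pseudocode comment above, it computed a nonzero mask, a cumulative sum of that mask, and then scattered each element's coordinate to slot (cumsum - 1). A plain C++ sketch of that algorithm for a 1-D input, illustration only:

#include <cstdint>
#include <vector>

// mask -> cumulative sum -> scatter of coordinates, for a 1-D input.
static std::vector<int64_t> nonzero_1d(const std::vector<float>& input) {
  std::vector<int64_t> out;
  int64_t cumsum = 0;                       // running count of nonzeros so far
  for (int64_t i = 0; i < static_cast<int64_t>(input.size()); ++i) {
    const bool mask = input[i] != 0.0f;     // inputNonZero
    cumsum += mask;                         // indices
    if (mask) {
      out.push_back(i);                     // lands at position cumsum - 1
    }
  }
  return out;
}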
MPSGraphTensor* constantRankTensor = [mpsGraph constantWithScalar:nDim dataType:MPSDataTypeInt32]; - maskedIndicesTensorArray[0] = [mpsGraph multiplicationWithPrimaryTensor:maskedIndicesTensor - secondaryTensor:constantRankTensor - name:nil]; - coordinatesTensorArray[0] = coordinatesTensor; - for (int i = 1; i < nDim; i++) { - maskedIndicesTensorArray[i] = [mpsGraph additionWithPrimaryTensor:maskedIndicesTensorArray[i - 1] - secondaryTensor:oneTensor - name:nil]; - coordinatesTensorArray[i] = [mpsGraph reshapeTensor:[mpsGraph coordinateAlongAxis:i - withShape:inputShape - name:nil] - withShape:@[ @-1 ] - name:nil]; - } - maskedIndicesTensor = [mpsGraph concatTensors:maskedIndicesTensorArray dimension:0 interleave:YES name:nil]; - coordinatesTensor = [mpsGraph concatTensors:coordinatesTensorArray dimension:0 interleave:YES name:nil]; - } + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self), getMPSShape(self)); - MPSGraphTensor* outputTensor = [mpsGraph scatterWithDataTensor:scatterDataTensor - updatesTensor:coordinatesTensor - indicesTensor:maskedIndicesTensor - axis:0 - mode:MPSGraphScatterModeSet - name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph nonZeroIndicesOfTensor:inputTensor name:nil]; newCachedGraph->inputTensor_ = inputTensor; - newCachedGraph->scatterDataTensor_ = scatterDataTensor; newCachedGraph->outputTensor_ = outputTensor; }); - Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, apparentInputShape); - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out, apparentOutputShape); - Placeholder scatterPlaceholder = Placeholder(cachedGraph->scatterDataTensor_, out, apparentOutputShape); - - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - scatterPlaceholder.getMPSGraphTensor() : scatterPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - }; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } if (!contiguous_output) { @@ -411,7 +330,13 @@ static Tensor nonzero_fallback(const Tensor& self) { } Tensor nonzero_mps(const Tensor& self) { - if (!is_macos_13_or_newer()) { + if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { + TORCH_WARN_ONCE("MPS: nonzero op is supported natively starting from macOS 13.0. ", + "Falling back on CPU. This may have performance implications."); + return nonzero_fallback(self); + } else if (self.is_complex()) { + TORCH_WARN_ONCE("MPS: nonzero op is not supported for complex datatypes ", + "Falling back on CPU. 
This may have performance implications."); return nonzero_fallback(self); } @@ -484,14 +409,8 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) { Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result, /*mpsShape*/ nil, /*gatherTensorData=*/false, outputDataType); - NSDictionary* feeds = - @{inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - // Run the graph - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; @@ -568,10 +487,7 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) { sourcePlaceholder.getMPSGraphTensor() : sourcePlaceholder.getMPSGraphTensorData(), cachedGraph->alphaTensor_ : getMPSGraphTensorFromScalar(stream, alpha_scalar), }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -625,7 +541,6 @@ Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { " and ", output.size(dim), "."); - TORCH_CHECK(!self.is_complex(), "index_select(): Yet not supported for complex"); for (const auto i : irange(self.dim())) { if (i == dim) @@ -651,6 +566,14 @@ Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { return output; } + // As of MacOS 14.4 gatherWithUpdatesTensor: still does not support complex + // So back to old view_as_real trick + if (self.is_complex()) { + auto out_view = at::view_as_real(output); + index_select_out_mps(at::view_as_real(self), dim, index, out_view); + return output; + } + // Derive from MPSCachedGraph struct CachedGraph : public MPSCachedGraph { CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} @@ -697,14 +620,8 @@ Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { /*gatherTensorData=*/false, /*dataType=*/outputType); - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, indexPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return output; @@ -785,10 +702,7 @@ Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { cachedGraph->valueTensor_ : getMPSGraphTensorFromScalar(stream, valueScalar) }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } namedinference::propagate_names_if_nonempty(self, maybe_outnames); return self; @@ -862,14 +776,8 @@ Tensor embedding_dense_backward_mps(const Tensor& grad_, auto indicesPlaceholder = Placeholder(cachedGraph->indicesTensor_, indices); auto outgoingGradPlaceholder = Placeholder(cachedGraph->outgoingGradTensor_, outgoing_gradient); - NSDictionary* feeds = @{ - 
incomingGradPlaceholder.getMPSGraphTensor() : incomingGradPlaceholder.getMPSGraphTensorData(), - indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outgoingGradPlaceholder.getMPSGraphTensor() : outgoingGradPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(incomingGradPlaceholder, indicesPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outgoingGradPlaceholder); } return outgoing_gradient; } @@ -1010,15 +918,8 @@ Tensor embedding_dense_backward_mps(const Tensor& grad_, /*gatherTensorData=*/false, /*dataType=*/inputType); - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData(), - updatePlaceholder.getMPSGraphTensor() : updatePlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, indexPlaceholder, updatePlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return self; } diff --git a/aten/src/ATen/native/mps/operations/Inverse.mm b/aten/src/ATen/native/mps/operations/Inverse.mm index ae142f02fba46..176222f2deeeb 100644 --- a/aten/src/ATen/native/mps/operations/Inverse.mm +++ b/aten/src/ATen/native/mps/operations/Inverse.mm @@ -53,13 +53,8 @@ Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, A); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - NSDictionary* feeds = - @{inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } diff --git a/aten/src/ATen/native/mps/operations/Lerp.mm b/aten/src/ATen/native/mps/operations/Lerp.mm index ca674336a907f..1ad34ef9a566a 100644 --- a/aten/src/ATen/native/mps/operations/Lerp.mm +++ b/aten/src/ATen/native/mps/operations/Lerp.mm @@ -1,5 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -11,8 +12,38 @@ namespace at::native { TORCH_IMPL_FUNC(lerp_Tensor_mps)(const Tensor& self, const Tensor& end, const Tensor& weight, const Tensor& out) { - // TODO: Write a much better implementation - at::add_out(const_cast(out), self, weight.mul(end.sub(self))); + TORCH_CHECK(out.is_mps()); + std::array args{{{out, "out", 0}, {self, "self", 1}, {end, "end", 2}, {weight, "weight", 3}}}; + checkAllSameGPU(__func__, args); + using namespace mps; + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* selfTensor_ = nil; + MPSGraphTensor* endTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + @autoreleasepool { + string key = "lerp_Tensor_mps" + getTensorsStringKey({self, end, weight}); + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto graph) { + auto selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + auto endTensor = 
mpsGraphRankedPlaceHolder(mpsGraph, end); + auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight); + auto distance = [mpsGraph subtractionWithPrimaryTensor:endTensor secondaryTensor:selfTensor name:nil]; + auto weighedDistance = [mpsGraph multiplicationWithPrimaryTensor:weightTensor secondaryTensor:distance name:nil]; + auto output = [mpsGraph additionWithPrimaryTensor:selfTensor secondaryTensor:weighedDistance name:nil]; + graph->selfTensor_ = selfTensor; + graph->endTensor_ = endTensor; + graph->weightTensor_ = weightTensor; + graph->outputTensor_ = output; + }); + auto selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); + auto endPlaceholder = Placeholder(cachedGraph->endTensor_, end); + auto weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, endPlaceholder, weightPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); + } } } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/Linear.mm b/aten/src/ATen/native/mps/operations/Linear.mm index 4e556189b0f1f..450e24c77c9d4 100644 --- a/aten/src/ATen/native/mps/operations/Linear.mm +++ b/aten/src/ATen/native/mps/operations/Linear.mm @@ -15,11 +15,17 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const c10::opt auto weight = (weight_arg.dim() == 1) ? weight_arg.view({1, weight_arg.size(0)}) : weight_arg; - TORCH_CHECK(input.scalar_type() == ScalarType::Float || input.scalar_type() == ScalarType::Half, - "MPS device does not support linear for non-float inputs"); + TORCH_CHECK(supportedFloatingType(input), "MPS device does not support linear for non-float inputs"); + TORCH_CHECK(input.is_mps(), "Tensor for argument input is on ", input.device(), " but expected on mps"); + TORCH_CHECK(supportedFloatingType(weight_arg), "MPS device does not support linear for non-float weights"); + TORCH_CHECK(weight_arg.is_mps(), "Tensor for argument weight is on ", weight_arg.device(), " but expected on mps"); const Tensor& bias = *(at::borrow_from_optional_tensor(bias_opt)); - bool is_bias_defined = bias.defined(); + const bool is_bias_defined = bias.defined(); + if (is_bias_defined) { + TORCH_CHECK(bias.is_mps(), "Tensor for argument bias is on ", bias.device(), " but expected on mps"); + TORCH_CHECK(supportedFloatingType(bias), "MPS device does not support linear for non-float bias"); + } auto input_size = input.sizes(); std::vector output_size(input_size.begin(), input_size.end() - 1); @@ -68,31 +74,26 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const c10::opt dimension:-1 withDimension:-2 name:nil]; - MPSGraphTensor* outputTensor = nil; - - if (!is_bias_defined) { - outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:inputTensor - secondaryTensor:weightTransposeTensor - name:nil]; - } else { - MPSGraphTensor* inputFlattened = inputTensor; - bool doReshape = false; + // matrixMultiplicationWithPrimary crashes for 5D tensors, see https://github.com/pytorch/pytorch/issues/114942 + bool doReshape = input.dim() > 4; + if (!doReshape && is_bias_defined) { // workaround to improve the performance with 3D+ inputs - if (input_size.size() > 2 && input_size[0] > 1 && input_size[1] >= 1 && input_size[1] <= 32 && - bias.dim() <= 1) { - doReshape = true; - inputFlattened = [mpsGraph flatten2DTensor:inputTensor axis:-1 name:nil]; - } + doReshape = + input_size.size() > 2 && 
input_size[0] > 1 && input_size[1] >= 1 && input_size[1] <= 32 && bias.dim() <= 1; + } + auto inputFlattened = doReshape ? [mpsGraph flatten2DTensor:inputTensor axis:-1 name:nil] : inputTensor; + auto outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:inputFlattened + secondaryTensor:weightTransposeTensor + name:nil]; + if (is_bias_defined) { newCachedGraph->biasTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, bias); - MPSGraphTensor* xMulWTTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:inputFlattened - secondaryTensor:weightTransposeTensor - name:nil]; - MPSGraphTensor* biasedTensor = [mpsGraph additionWithPrimaryTensor:xMulWTTensor - secondaryTensor:newCachedGraph->biasTensor_ - name:nil]; - outputTensor = doReshape ? [mpsGraph reshapeTensor:biasedTensor withShape:getMPSShape(output_size) name:nil] - : biasedTensor; + outputTensor = [mpsGraph additionWithPrimaryTensor:outputTensor + secondaryTensor:newCachedGraph->biasTensor_ + name:nil]; + } + if (doReshape) { + outputTensor = [mpsGraph reshapeTensor:outputTensor withShape:getMPSShape(output_size) name:nil]; } newCachedGraph->inputTensor_ = inputTensor; @@ -112,10 +113,7 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const c10::opt biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias); feeds[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } // Shave off '1' present at the end of the shape @@ -130,13 +128,11 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const c10::opt static Tensor _mps_linear_backward_input(IntArrayRef input_size, const Tensor& grad_output, const Tensor& weight) { TORCH_CHECK(grad_output.is_mps(), "mps_linear_backward: grad_output needs to be mps layout"); - TORCH_CHECK(weight.device().is_mps() && (weight.scalar_type() == kFloat || (weight.scalar_type() == kHalf)), + TORCH_CHECK(weight.device().is_mps() && supportedFloatingType(weight), "mps_linear_backward: unsupported weights data type: ", weight.scalar_type()); - TORCH_CHECK(grad_output.scalar_type() == ScalarType::Double || grad_output.scalar_type() == ScalarType::Float || - grad_output.scalar_type() == ScalarType::Half, - "MPS device does not support linear backward for non-float inputs"); + TORCH_CHECK(supportedFloatingType(grad_output), "MPS device does not support linear backward for non-float inputs"); const Tensor weight_reshaped = weight.is_contiguous() ? 
weight : weight.contiguous(); @@ -159,15 +155,23 @@ static Tensor _mps_linear_backward_input(IntArrayRef input_size, const Tensor& g @autoreleasepool { string key = "mps_linear_backward_input" + getTensorsStringKey({grad_output, weight_reshaped}); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto* mpsGraph, auto* newCachedGraph) { - MPSGraphTensor* weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_reshaped); - MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); - - MPSGraphTensor* outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:gradOutputTensor - secondaryTensor:weightTensor - name:nil]; + newCachedGraph->weightTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, weight_reshaped); + newCachedGraph->gradOutputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + + // MPS matrixMultiplication crashes for 5D+ tensors on 14.2.1 with `New volume should match old volume` + // See https://github.com/pytorch/pytorch/issues/114942 for more details + bool needReshape = grad_output.dim() > 4; + auto gradOutputTensor = needReshape + ? [mpsGraph flatten2DTensor:newCachedGraph->gradOutputTensor_ axis:-1 name:nil] + : newCachedGraph->gradOutputTensor_; + + auto outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:gradOutputTensor + secondaryTensor:newCachedGraph->weightTensor_ + name:nil]; + if (needReshape) { + outputTensor = [mpsGraph reshapeTensor:outputTensor withShape:getMPSShape(output) name:nil]; + } - newCachedGraph->weightTensor_ = weightTensor; - newCachedGraph->gradOutputTensor_ = gradOutputTensor; newCachedGraph->outputTensor_ = outputTensor; }); @@ -175,15 +179,8 @@ static Tensor _mps_linear_backward_input(IntArrayRef input_size, const Tensor& g Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = @{ - weightPlaceholder.getMPSGraphTensor() : weightPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(weightPlaceholder, gradOutputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); return output; } @@ -196,8 +193,7 @@ static Tensor _mps_linear_backward_input(IntArrayRef input_size, const Tensor& g TORCH_CHECK(grad_output.is_mps() && input.is_mps(), "_mps_linear_backward: grad_output and input needs to be mps layout"); - TORCH_CHECK(grad_output.scalar_type() == ScalarType::Float || grad_output.scalar_type() == ScalarType::Half, - "MPS device does not support linear backward for non-float inputs"); + TORCH_CHECK(supportedFloatingType(grad_output), "MPS device does not support linear backward for non-float inputs"); struct CachedGraph : public MPSCachedGraph { CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} @@ -273,17 +269,9 @@ static Tensor _mps_linear_backward_input(IntArrayRef input_size, const Tensor& g Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); Placeholder biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias); - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - inputPlaceholder.getMPSGraphTensor() : 
inputPlaceholder.getMPSGraphTensorData(), - weightPlaceholder.getMPSGraphTensor() : weightPlaceholder.getMPSGraphTensorData() - }; - - NSMutableDictionary* results = [NSMutableDictionary dictionary]; - results[outputPlaceholder.getMPSGraphTensor()] = outputPlaceholder.getMPSGraphTensorData(); - if (bias_defined) - results[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); - + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder, weightPlaceholder); + auto results = bias_defined ? dictionaryFromPlaceholders(outputPlaceholder, biasPlaceholder) + : dictionaryFromPlaceholders(outputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); return std::tuple{output, bias}; diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm index 5d2e01f457c2d..002426db125e9 100644 --- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm +++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm @@ -4,6 +4,8 @@ #include #include #include +// For MTLLanguageVersion_3_1 +#include #include #ifndef AT_PER_OPERATOR_HEADERS @@ -22,16 +24,114 @@ namespace at::native { namespace mps { +namespace { +static MetalShaderLibrary lib(R"MATMUL_METAL( +#include + +using namespace metal; +template +T dot_product(constant T *v1, constant T* v2, ulong2 strides, uint32_t size) { + T rc = T(0.0); + for (uint32_t i = 0; i < size; ++i) { + rc += v1[i * strides.x] * v2[i * strides.y]; + } + return rc; +} -enum LinearAlgebraOpType { ADDBMM_OP_TYPE, BADDBMM_OP_TYPE }; +template +kernel void naive_matmul( + constant T * mat1Data [[buffer(0)]], + constant T * mat2Data [[buffer(1)]], + device T * outputData [[buffer(2)]], + constant array & strides [[buffer(3)]], + constant uint3 & sizes [[buffer(4)]], + uint thread_index [[thread_position_in_grid]]) { + uint y = thread_index / sizes.x; + uint x = thread_index % sizes.x; + if (x >= sizes.x || y >= sizes.z) { + return; + } + auto rc = dot_product(mat1Data + x * strides[0].x, + mat2Data + y * strides[1].y, + ulong2(strides[0].y, strides[1].x), + sizes.y); + outputData[x * strides[2].x + y * strides[2].y] = rc; +} + +#define INSTANTIATE_NAIVE_MM(DTYPE) \ +template \ +[[host_name("naive_matmul_" #DTYPE)]] \ +kernel void naive_matmul( \ + constant DTYPE * mat1Data [[buffer(0)]], \ + constant DTYPE * mat2Data [[buffer(1)]], \ + device DTYPE * outputData [[buffer(2)]], \ + constant array & strides [[buffer(3)]], \ + constant uint3 & sizes [[buffer(4)]], \ + uint thread_index [[thread_position_in_grid]]) + +INSTANTIATE_NAIVE_MM(float); +INSTANTIATE_NAIVE_MM(half); +#if __METAL_VERSION__ >= 310 +INSTANTIATE_NAIVE_MM(bfloat); +#endif +)MATMUL_METAL"); + +Tensor& do_metal_mm(const Tensor& self, const Tensor& other, Tensor& output) { + auto stream = getCurrentMPSStream(); + auto device = MPSDevice::getInstance()->device(); + auto matmulPSO = lib.getPipelineStateForFunc("naive_matmul_" + mps::scalarToMetalTypeString(output)); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + getMPSProfiler().beginProfileKernel(matmulPSO, "naive_matmul", {self, other}); + auto computeEncoder = stream->commandEncoder(); + [computeEncoder setComputePipelineState:matmulPSO]; + std::array sizes = {static_cast(self.size(0)), + static_cast(self.size(1)), + static_cast(output.size(1))}; + std::array strides = { + self.stride(0), self.stride(1), other.stride(0), other.stride(1), output.stride(0), output.stride(1)}; + mtl_setBuffer(computeEncoder, self, 0); + 
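As a sanity check on the kernel's index math: each thread of naive_matmul produces one output element as a strided dot product, C[x, y] = dot(A[x, :], B[:, y]), with x = thread_index % M, y = thread_index / M, and sizes packed as (M, K, N) in do_metal_mm. A rough Python reference for that per-element computation, offered only as a sketch of the arithmetic:

```python
import torch

# Per-element reference for the naive Metal matmul: C[x, y] = dot(A[x, :], B[:, y]).
# The kernel advances A by stride(1) and B by stride(0) along K, so transposed or
# otherwise strided views are handled without a copy.
def naive_matmul_reference(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    m, k = a.shape
    n = b.shape[1]
    out = torch.empty(m, n, dtype=a.dtype)
    for x in range(m):       # x = thread_index % M
        for y in range(n):   # y = thread_index / M
            out[x, y] = (a[x, :] * b[:, y]).sum()
    return out

a, b = torch.randn(5, 7), torch.randn(7, 3)
torch.testing.assert_close(naive_matmul_reference(a, b), a @ b)
```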
mtl_setBuffer(computeEncoder, other, 1); + mtl_setBuffer(computeEncoder, output, 2); + [computeEncoder setBytes:strides.data() length:sizeof(uint64_t) * strides.size() atIndex:3]; + [computeEncoder setBytes:sizes.data() length:sizeof(uint32_t) * sizes.size() atIndex:4]; + mtl_dispatch1DJob(computeEncoder, matmulPSO, output.numel()); + getMPSProfiler().endProfileKernel(matmulPSO); + } + }); + return output; +} + +std::tuple do_mm(MPSGraph* graph, + const Tensor& self, + const Tensor& other) { + if (self.numel() == 0 || other.numel() == 0) { + auto output = [graph constantWithScalar:0.0 + shape:getMPSShape({self.size(0), other.size(1)}) + dataType:getMPSDataType(self)]; + return {nil, nil, output}; + } + auto selfTensor = mpsGraphRankedPlaceHolder(graph, self); + auto otherTensor = mpsGraphRankedPlaceHolder(graph, other); + auto output = [graph matrixMultiplicationWithPrimaryTensor:selfTensor secondaryTensor:otherTensor name:nil]; + return {selfTensor, otherTensor, output}; +} + +bool use_metal_mm(const Tensor& self, const Tensor& other, const Tensor& output) { + static bool always_use_metal = std::getenv("PYTORCH_MPS_PREFER_METAL") != nullptr; + constexpr auto max_stride_size = 32768; + return always_use_metal || self.stride(0) > max_stride_size || self.stride(1) > max_stride_size || + self.size(0) > max_stride_size || self.size(1) > max_stride_size || other.stride(0) > max_stride_size || + other.stride(1) > max_stride_size || other.size(0) > max_stride_size || other.size(1) > max_stride_size; +} + +} // anonymous namespace static Tensor& mm_out_mps_impl(const Tensor& self, const Tensor& other, Tensor& output) { using namespace mps; using CachedGraph = MPSBinaryCachedGraph; TORCH_CHECK(self.dim() == 2 && other.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK(self.scalar_type() == ScalarType::Double || self.scalar_type() == ScalarType::Float || - self.scalar_type() == ScalarType::Half, - "MPS device does not support mm for non-float inputs"); + TORCH_CHECK(supportedFloatingType(self), "MPS device does not support mm for non-float inputs"); TensorArg args[]{{output, "out", 0}, {self, "mat1", 1}, {other, "mat2", 2}}; checkAllSameGPU("mm", args); @@ -39,60 +139,38 @@ TORCH_CHECK(output.is_mps()); // Transpose inputs if needed - IntArrayRef output_sizes = output.sizes(); - if ((output_sizes[0] == 0) || (output_sizes[1] == 0)) { + if (output.numel() == 0) { return output; } - MPSStream* stream = getCurrentMPSStream(); + // MPS matmul returns silently incorrect results if one of the matrix dimensions is greater than 2**15 + // And crashes if its a view of matrix with dimensions larger than 2**15 + // See https://github.com/pytorch/pytorch/issues/116769#issuecomment-1888302095 + // In such cases, fallback to naive but accurate metal shader + if (use_metal_mm(self, other, output)) { + return do_metal_mm(self, other, output); + } @autoreleasepool { string key = "mm_out_mps_impl" + getTensorsStringKey({self, other}); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* selfTensor = nil; - MPSGraphTensor* otherTensor = nil; - MPSGraphTensor* outputTensor = nil; - - if (self.numel() == 0 || other.numel() == 0) { - outputTensor = [mpsGraph constantWithScalar:0. 
shape:getMPSShape(output_sizes) dataType:getMPSDataType(output)]; - - } else { - selfTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, self); - otherTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, other); - outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:selfTensor secondaryTensor:otherTensor name:nil]; - } - - newCachedGraph->inputTensor_ = selfTensor; - newCachedGraph->otherTensor_ = otherTensor; - newCachedGraph->outputTensor_ = outputTensor; + std::tie(newCachedGraph->inputTensor_, newCachedGraph->otherTensor_, newCachedGraph->outputTensor_) = + do_mm(mpsGraph, self, other); }); - Placeholder selfPlaceholder = Placeholder(); - Placeholder otherPlaceholder = Placeholder(); + auto selfPlaceholder = self.numel() != 0 ? Placeholder(cachedGraph->inputTensor_, self) : Placeholder(); + auto otherPlaceholder = other.numel() != 0 ? Placeholder(cachedGraph->otherTensor_, other) : Placeholder(); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - if (!(self.numel() == 0 || other.numel() == 0)) { - selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); - otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); - } - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - - NSDictionary* feeds = nil; - - if (!(self.numel() == 0 || other.numel() == 0)) - feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = self.numel() != 0 ? dictionaryFromPlaceholders(selfPlaceholder, otherPlaceholder) : nil; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } return output; } +enum LinearAlgebraOpType { ADDBMM_OP_TYPE, BADDBMM_OP_TYPE }; + static Tensor& addbmm_or_baddbmm_out_mps_impl(const Tensor& input, const Tensor& batch1, const Tensor& batch2, @@ -107,9 +185,7 @@ TORCH_CHECK(batch2.is_mps()); TORCH_CHECK(result.is_mps()); - TORCH_CHECK(batch1.scalar_type() == ScalarType::Double || batch1.scalar_type() == ScalarType::Float || - batch1.scalar_type() == ScalarType::Half, - "MPS device does not support addbmm or baddbmm for non-float inputs"); + TORCH_CHECK(supportedFloatingType(batch1), "MPS device does not support addbmm or baddbmm for non-float inputs"); TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); @@ -193,21 +269,14 @@ newCachedGraph->batch2Tensor_ = batch2Tensor; newCachedGraph->outputTensor_ = outputTensor; }); + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input); Placeholder batch1Placeholder = Placeholder(cachedGraph->batch1Tensor_, batch1); Placeholder batch2Placeholder = Placeholder(cachedGraph->batch2Tensor_, batch2); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - batch1Placeholder.getMPSGraphTensor() : batch1Placeholder.getMPSGraphTensorData(), - batch2Placeholder.getMPSGraphTensor() : batch2Placeholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + 
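The naive Metal path above is selected automatically whenever a matrix dimension or stride exceeds 32768, and it can also be forced for testing through the PYTORCH_MPS_PREFER_METAL environment variable checked in use_metal_mm. A minimal smoke test, assuming only the public torch API (the flag is cached in a function-local static, so it has to be set before the first matmul is dispatched):

```python
# Force the naive Metal matmul path and compare against a CPU reference.
# PYTORCH_MPS_PREFER_METAL is read once, so set it before any mm is dispatched.
import os
os.environ["PYTORCH_MPS_PREFER_METAL"] = "1"

import torch

a = torch.randn(64, 48, device="mps")
b = torch.randn(48, 32, device="mps")
out = a @ b                      # routed to do_metal_mm instead of MPSGraph
ref = a.cpu() @ b.cpu()
torch.testing.assert_close(out.cpu(), ref, rtol=1e-4, atol=1e-4)
```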
auto feeds = dictionaryFromPlaceholders(inputPlaceholder, batch1Placeholder, batch2Placeholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; @@ -223,9 +292,7 @@ TORCH_CHECK(output.is_mps()); TORCH_CHECK(self.dim() == 2 && other.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK(self.scalar_type() == ScalarType::Double || self.scalar_type() == ScalarType::Float || - self.scalar_type() == ScalarType::Half, - "MPS device does not support addmm for non-float input"); + TORCH_CHECK(supportedFloatingType(self), "MPS device does not support addmm for non-float input"); TensorArg args[]{{output, "out", 0}, {bias, "self", 1}, {self, "mat1", 2}, {other, "mat2", 3}}; checkAllSameGPU(__func__, args); @@ -248,13 +315,10 @@ if (&output != &self) { output.resize_(bias_sizes); } - IntArrayRef output_sizes = output.sizes(); - if ((output_sizes[0] == 0) || (output_sizes[1] == 0)) { + if (output.numel() == 0) { return output; } - MPSStream* stream = getCurrentMPSStream(); - bool is_beta_non_zero = beta.toDouble() != 0.0; struct CachedGraph : public mps::MPSCachedGraph { @@ -269,15 +333,13 @@ string key = "addmm_out_mps_impl" + getTensorsStringKey({self, other, *bias_}) + ":" + to_string(beta.toDouble()) + ":" + to_string(alpha.toDouble()); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* selfTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, self); - MPSGraphTensor* otherTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, other); - MPSGraphTensor* biasTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, *bias_); + MPSGraphTensor* selfTensor = nil; + MPSGraphTensor* otherTensor = nil; + MPSGraphTensor* productTensor = nil; + MPSGraphTensor* biasTensor = mpsGraphRankedPlaceHolder(mpsGraph, *bias_); // TODO: Use alpha and beta here with fill_.Scalar and mul - // Intermediate as placeholder - MPSGraphTensor* productTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:selfTensor - secondaryTensor:otherTensor - name:@"MM/(mat1@mat2)"]; + std::tie(selfTensor, otherTensor, productTensor) = do_mm(mpsGraph, self, other); auto productTimesAlphaTensor = productTensor; if (alpha.toDouble() != 1.0) { @@ -309,21 +371,14 @@ newCachedGraph->outputTensor_ = outputTensor; }); - Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); - Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder selfPlaceholder = self.numel() != 0 ? Placeholder(cachedGraph->selfTensor_, self) : Placeholder(); + Placeholder otherPlaceholder = other.numel() != 0 ? Placeholder(cachedGraph->otherTensor_, other) : Placeholder(); Placeholder biasPlaceholder = Placeholder(cachedGraph->biasTensor_, *bias_); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData(), - biasPlaceholder.getMPSGraphTensor() : biasPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = self.numel() != 0 ? 
dictionaryFromPlaceholders(selfPlaceholder, otherPlaceholder, biasPlaceholder) + : dictionaryFromPlaceholders(biasPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } return output; @@ -332,15 +387,32 @@ static Tensor& bmm_out_mps_impl(const Tensor& batch1, const Tensor& batch2, Tensor& result) { using namespace mps; - TORCH_CHECK(batch1.scalar_type() == ScalarType::Double || batch1.scalar_type() == ScalarType::Float || - batch1.scalar_type() == ScalarType::Half, - "MPS device does not support bmm for non-float inputs"); + TORCH_CHECK(supportedFloatingType(batch1), "MPS device does not support bmm for non-float inputs"); if (batch1.numel() == 0 || batch2.numel() == 0) { result.zero_(); return result; } + MPSShape* shape = nil; + bool doTranspose = false; + + // Handle transposes for the second batch of matrices. + if (batch2.is_view() && !batch2.is_contiguous()) { + if (batch2.numel() == batch2._base().numel()) { + const IntArrayRef& viewSizes = batch2.sizes(); + + // Handle 3D and 4D tensors. + // For 4D tensors, first it must have been reshaped from 4D to 3D and then transposed. + int32_t baseTransposeStrideDim = batch2._base().dim() == 4 ? -3 : -2; + if (batch2._base().stride(0) == batch2.stride(0) && + batch2._base().stride(baseTransposeStrideDim) == batch2.stride(-1)) { + shape = @[ @(viewSizes[0]), @(viewSizes[2]), @(viewSizes[1]) ]; + doTranspose = true; + } + } + } + MPSStream* stream = getCurrentMPSStream(); struct CachedGraph : public mps::MPSCachedGraph { @@ -351,14 +423,20 @@ }; @autoreleasepool { - string key = "bmm_out_mps_impl" + getTensorsStringKey({batch1, batch2}); + string key = "bmm_out_mps_impl" + getTensorsStringKey({batch1, batch2}, true, /*exclude_shape*/ true) + + std::to_string(doTranspose); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* batch1Tensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, batch1); - MPSGraphTensor* batch2Tensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, batch2); + MPSGraphTensor* batch1Tensor = mps::mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batch1.scalar_type())); + MPSGraphTensor* batch2Tensor = mps::mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batch2.scalar_type())); + MPSGraphTensor* batch2TensorTranspose = batch2Tensor; + + if (doTranspose) { + batch2TensorTranspose = [mpsGraph transposeTensor:batch2Tensor dimension:-1 withDimension:-2 name:nil]; + } MPSGraphTensor* productTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:batch1Tensor - secondaryTensor:batch2Tensor + secondaryTensor:batch2TensorTranspose name:@"MM/(batch1@batch2)"]; newCachedGraph->batch1Tensor_ = batch1Tensor; @@ -366,18 +444,11 @@ newCachedGraph->outputTensor_ = productTensor; }); Placeholder batch1Placeholder = Placeholder(cachedGraph->batch1Tensor_, batch1); - Placeholder batch2Placeholder = Placeholder(cachedGraph->batch2Tensor_, batch2); + Placeholder batch2Placeholder = Placeholder(cachedGraph->batch2Tensor_, batch2, shape, !doTranspose); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - NSDictionary* feeds = @{ - batch1Placeholder.getMPSGraphTensor() : batch1Placeholder.getMPSGraphTensorData(), - batch2Placeholder.getMPSGraphTensor() : batch2Placeholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = 
dictionaryFromPlaceholders(batch1Placeholder, batch2Placeholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; @@ -496,9 +567,7 @@ Tensor addr_mps(const Tensor& self, const Tensor& vec1, const Tensor& vec2, cons TORCH_CHECK(result.is_mps()); TORCH_CHECK(vec1.dim() == 1 && vec2.dim() == 1, "tensors must be 1-D"); - TORCH_CHECK(vec1.scalar_type() == ScalarType::Double || vec1.scalar_type() == ScalarType::Float || - vec1.scalar_type() == ScalarType::Half, - "MPS device does not support addr for non-float input"); + TORCH_CHECK(supportedFloatingType(vec1), "MPS device does not support addr for non-float input"); TensorArg args[]{{result, "out", 0}, {self, "self", 1}, {vec1, "vec1", 2}, {vec2, "vec2", 3}}; checkAllSameGPU(__func__, args); @@ -592,16 +661,8 @@ Tensor addr_mps(const Tensor& self, const Tensor& vec1, const Tensor& vec2, cons Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, *self_); Placeholder resultPlaceholder = Placeholder(cachedGraph->resultTensor_, result); - NSDictionary* feeds = @{ - vec1Placeholder.getMPSGraphTensor() : vec1Placeholder.getMPSGraphTensorData(), - vec2Placeholder.getMPSGraphTensor() : vec2Placeholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData()}; - - mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(vec1Placeholder, vec2Placeholder, selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, resultPlaceholder); } return result; diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm index 8f563beabe250..3e58d2ca8a4b2 100644 --- a/aten/src/ATen/native/mps/operations/LossOps.mm +++ b/aten/src/ATen/native/mps/operations/LossOps.mm @@ -76,7 +76,7 @@ static string reductionToString(int64_t reduction) { newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target); newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); - MPSGraphTensor* normTensor = [mpsGraph constantWithScalar:norm dataType:MPSDataTypeFloat32]; + MPSGraphTensor* normTensor = [mpsGraph constantWithScalar:norm dataType:[newCachedGraph->inputTensor dataType]]; MPSGraphTensor* diffTensor = [mpsGraph subtractionWithPrimaryTensor:newCachedGraph->inputTensor secondaryTensor:newCachedGraph->targetTensor name:nil]; @@ -92,15 +92,8 @@ static string reductionToString(int64_t reduction) { Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor, grad_input); Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor, grad_output); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, targetPlaceholder, gradOutputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, gradInputPlaceholder); } return grad_input; @@ -123,11 +116,12 @@ static string 
reductionToString(int64_t reduction) { static MPSGraphTensor* bce_forward_mps(CachedGraph* bceGraph) { MPSGraph* mpsGraph = bceGraph->graph(); + const auto inputType = [bceGraph->inputTensor dataType]; // Forward BCE: L = -w (y ln(x) + (1-y) ln(1-x)) - MPSGraphTensor* one = [mpsGraph constantWithScalar:1.0 dataType:MPSDataTypeFloat32]; + MPSGraphTensor* one = [mpsGraph constantWithScalar:1.0 dataType:inputType]; // -100 is the hard limit value defined in BCELoss Spec. to clamp the log - MPSGraphTensor* neg100 = [mpsGraph constantWithScalar:-100.0 dataType:MPSDataTypeFloat32]; + MPSGraphTensor* neg100 = [mpsGraph constantWithScalar:-100.0 dataType:inputType]; // 1 - x MPSGraphTensor* one_Input = [mpsGraph subtractionWithPrimaryTensor:one secondaryTensor:bceGraph->inputTensor @@ -161,11 +155,12 @@ static string reductionToString(int64_t reduction) { static MPSGraphTensor* bce_backward_mps(CachedGraph* bceGraph) { MPSGraph* mpsGraph = bceGraph->graph(); + const auto inputType = [bceGraph->inputTensor dataType]; // Backward BCE: d(L)/d(x) = -w (y - x) / (x - x^2) - MPSGraphTensor* one = [mpsGraph constantWithScalar:1.0 dataType:MPSDataTypeFloat32]; + MPSGraphTensor* one = [mpsGraph constantWithScalar:1.0 dataType:inputType]; // epsilon used to clamp the grad input denominator - MPSGraphTensor* epsilon = [mpsGraph constantWithScalar:1e-12 dataType:MPSDataTypeFloat32]; + MPSGraphTensor* epsilon = [mpsGraph constantWithScalar:1e-12 dataType:inputType]; // 1 - x MPSGraphTensor* one_Input = [mpsGraph subtractionWithPrimaryTensor:one secondaryTensor:bceGraph->inputTensor @@ -245,7 +240,7 @@ static string reductionToString(int64_t reduction) { if (grad_output.defined()) { if (reduction == at::Reduction::Mean) { MPSGraphTensor* inputNumel = [mpsGraph constantWithScalar:static_cast(input.numel()) - dataType:MPSDataTypeFloat32]; + dataType:[bceLoss dataType]]; newCachedGraph->gradInputTensor = [mpsGraph divisionWithPrimaryTensor:bceLoss secondaryTensor:inputNumel name:nil]; @@ -273,10 +268,7 @@ static string reductionToString(int64_t reduction) { feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{lossPlaceholder.getMPSGraphTensor() : lossPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, lossPlaceholder); } return loss; @@ -411,10 +403,7 @@ static void nllnd_loss_backward_impl(Tensor& grad_input_arg, if (isWeightsArrayValid) { feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -581,11 +570,7 @@ static void nllnd_loss_forward_impl(Tensor& output, if (isWeightsArrayValid) feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - totalWeightsPlaceholder.getMPSGraphTensor() : totalWeightsPlaceholder.getMPSGraphTensorData() - }; - + auto results = dictionaryFromPlaceholders(outputPlaceholder, totalWeightsPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } @@ -680,14 +665,8 @@ static void 
smooth_l1_loss_impl(const Tensor& input, Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target, mpsInputShape); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, mpsOutputShape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, targetPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -804,15 +783,8 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output, Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, targetPlaceholder, gradOutputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -887,15 +859,8 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output, Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, targetPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } return output; } @@ -1002,23 +967,23 @@ Tensor huber_loss_mps(const Tensor& input, const Tensor& target, int64_t reducti Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, grad_input); - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder, targetPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return grad_input; } // MSELoss 
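The loss changes above make the BCE graph's scalar constants (1, the -100 log clamp, and the 1e-12 backward epsilon) follow the input's dtype instead of hard-coded MPSDataTypeFloat32, which is what reduced-precision inputs need. A small, hedged smoke test (it assumes half-precision binary_cross_entropy is accepted on the MPS backend once the constants match the input dtype):

```python
import torch
import torch.nn.functional as F

# Run BCE forward/backward with float16 inputs; the graph constants now
# inherit this dtype rather than being created as float32.
x = torch.rand(128, device="mps", dtype=torch.float16, requires_grad=True)
y = (torch.rand(128, device="mps") > 0.5).to(torch.float16)
loss = F.binary_cross_entropy(x, y, reduction="mean")
loss.backward()
print(loss.item(), x.grad.dtype)  # expect a finite loss and float16 grads
```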
-TORCH_IMPL_FUNC(mse_loss_out_mps)(const Tensor& input, const Tensor& target, int64_t reduction, const Tensor& output) { +TORCH_IMPL_FUNC(mse_loss_out_mps)(const Tensor& input, const Tensor& target, int64_t reduction, const Tensor& output_) { string op_name = __func__; using namespace mps; + + bool contiguousOutput = output_.is_contiguous(); + Tensor output = output_; + if (!contiguousOutput) { + output = output_.contiguous(); + } + TORCH_CHECK(target.is_same_size(input), op_name + ": target and input tensors must have identical shapes") TORCH_CHECK(output.is_mps()); @@ -1043,16 +1008,14 @@ Tensor huber_loss_mps(const Tensor& input, const Tensor& target, int64_t reducti }); Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target); - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, contiguousOutput ? output_ : output); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, targetPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); + } - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + if (!contiguousOutput) { + output_.copy_(output); } } diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm index eb754ae597689..bdca3b09780b2 100644 --- a/aten/src/ATen/native/mps/operations/Normalization.mm +++ b/aten/src/ATen/native/mps/operations/Normalization.mm @@ -10,7 +10,9 @@ #include #include #else +#include #include +#include #include #include #include @@ -406,6 +408,36 @@ Check if running mean exists (maybe do this check before making graph) return std::make_tuple(output, save_mean, save_var); } +std::tuple _batch_norm_with_update_mps(const Tensor& input, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + Tensor& running_mean, + Tensor& running_var, + double momentum, + double eps) { + Tensor output, save_mean, save_var; + std::tie(output, save_mean, save_var) = + batch_norm_mps(input, weight_opt, bias_opt, running_mean, running_var, /*train*/ true, momentum, eps); + Tensor reserve = at::empty({0}, input.options().dtype(kByte)); + return std::tuple(output, save_mean, save_var, reserve); +} + +std::tuple _batch_norm_with_update_mps_out(const Tensor& input, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + Tensor& running_mean, + Tensor& running_var, + double momentum, + double eps, + Tensor& out, + Tensor& save_mean, + Tensor& save_var, + Tensor& reserve) { + std::tie(out, save_mean, save_var) = batch_norm_mps_out( + input, weight_opt, bias_opt, running_mean, running_var, /*update*/ true, momentum, eps, out, save_mean, save_var); + return std::tuple(out, save_mean, save_var, reserve); +} + std::tuple _batch_norm_legit_mps(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, @@ -471,6 +503,29 @@ static string get_mem_string(c10::MemoryFormat memory_format) { } // Batch norm backward +std::tuple _new_batch_norm_backward_mps(const Tensor& grad_output, + const Tensor& input, + const 
Tensor& weight, + const c10::optional& running_mean_opt, + const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, + const c10::optional& save_var_opt, + bool update, + double eps, + std::array grad_input_mask, + const Tensor& reserve) { + return batch_norm_backward_mps(grad_output, + input, + weight, + running_mean_opt, + running_var_opt, + save_mean_opt, + save_var_opt, + update, + eps, + grad_input_mask); +} + std::tuple batch_norm_backward_mps(const Tensor& grad_out, const Tensor& input, const c10::optional& weight_opt, diff --git a/aten/src/ATen/native/mps/operations/Pad.mm b/aten/src/ATen/native/mps/operations/Pad.mm index 377bbb236f884..badf2c064564e 100644 --- a/aten/src/ATen/native/mps/operations/Pad.mm +++ b/aten/src/ATen/native/mps/operations/Pad.mm @@ -317,9 +317,7 @@ if (is_backward_pass) { feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } return output; } @@ -467,7 +465,7 @@ Tensor replication_pad3d_backward_mps(const Tensor& grad_output, const Tensor& i return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeClampToEdge, 0.0, __func__); } -// backward pass is exlicitly handled in autograd by negating the "pad" argument +// backward pass is explicitly handled in autograd by negating the "pad" argument Tensor constant_pad_nd_mps(const Tensor& self, IntArrayRef pad, const Scalar& value) { if (pad.size() > 6) { TORCH_WARN_ONCE("MPS: The constant padding of more than 3 dimensions is not currently supported natively. 
", diff --git a/aten/src/ATen/native/mps/operations/PixelShuffle.mm b/aten/src/ATen/native/mps/operations/PixelShuffle.mm index 30e85bfde4ec1..f93fb62dc23c5 100644 --- a/aten/src/ATen/native/mps/operations/PixelShuffle.mm +++ b/aten/src/ATen/native/mps/operations/PixelShuffle.mm @@ -75,15 +75,8 @@ static Tensor pixel_shuffle_helper(const Tensor& self, int64_t factor, bool upsc Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - - // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return output; diff --git a/aten/src/ATen/native/mps/operations/PointwiseOps.mm b/aten/src/ATen/native/mps/operations/PointwiseOps.mm index 364acb4323f42..137c14be6ef4d 100644 --- a/aten/src/ATen/native/mps/operations/PointwiseOps.mm +++ b/aten/src/ATen/native/mps/operations/PointwiseOps.mm @@ -86,10 +86,7 @@ static void addc_mul_div_out_mps(const Tensor& self, cachedGraph->valueTensor : getMPSGraphTensorFromScalar(mpsStream, value_scalar), }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results); + runMPSGraph(mpsStream, cachedGraph->graph(), feeds, outputPlaceholder); } } diff --git a/aten/src/ATen/native/mps/operations/Quantized.mm b/aten/src/ATen/native/mps/operations/Quantized.mm new file mode 100644 index 0000000000000..3743c6f13c371 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Quantized.mm @@ -0,0 +1,202 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif +#include +#include +// For Metal3_1 +#include +#include + +namespace at::native { + +using namespace mps; + +static at::native::mps::MetalShaderLibrary lib(R"METAL_QUANTIZED( +#include +using namespace metal; + +// A is sizes.x x sizes.y +// B.T is sizes.z x sizes.y +// C is sizes.x x sizes.z + +template +kernel void int4pack_mm( + constant T * A [[buffer(0)]], + constant uchar * B [[buffer(1)]], + constant T * scalesAndZeros [[buffer(2)]], + device T * outputData [[buffer(3)]], + constant uint3 & sizes [[buffer(4)]], + uint thread_index [[thread_position_in_grid]]) { + const uint lda = sizes.y; + const uint ldc = sizes.z; + const uint m = thread_index / sizes.z; // 0..sizes.x-1 + const uint n = thread_index % sizes.z; // 0..sizes.z-1 + const uint nb = n / 32; + const uint ldb = min(32U, sizes.z - nb * 32); + const uint32_t k_block = (sizes.y + groupSize - 1) / groupSize; + constant T *A_ptr = A + m * lda; + constant uchar *B_ptr = B + (nb * 16 * sizes.y); + + float rc = 0.0; + uint k = 0; + for (uint32_t kb = 0; kb < k_block ; kb ++) { + const T scale = scalesAndZeros[(kb * ldc + n) * 2 + 0]; + const T zero = scalesAndZeros[(kb * ldc + n) * 2 + 1] - scale * T(8); + for(uint idx = 0; idx < groupSize && k < sizes.y; idx++, k++) { + const auto a_val = float(A_ptr[k]); + uchar b_val = B_ptr[(k * ldb + (n % 32))/2]; + b_val = (n & 1) == 0 ? 
b_val & 0x0f : (b_val >> 4); + rc += a_val * float(scale * T(b_val) + zero); + } + } + outputData[thread_index] = T(rc); +} + +#define INSTANTIATE_INT4MM(DTYPE, GSIZE) \ +template \ +[[host_name("int4pack_mm_" #GSIZE "_" #DTYPE)]] \ +kernel void int4pack_mm( \ + constant DTYPE * A [[buffer(0)]], \ + constant uchar * B [[buffer(1)]], \ + constant DTYPE * scalesAndZeros [[buffer(2)]], \ + device DTYPE * outputData [[buffer(3)]], \ + constant uint3 & sizes [[buffer(4)]], \ + uint thread_index [[thread_position_in_grid]]) + +INSTANTIATE_INT4MM(float, 32); +INSTANTIATE_INT4MM(half, 32); +INSTANTIATE_INT4MM(float, 64); +INSTANTIATE_INT4MM(half, 64); +INSTANTIATE_INT4MM(float, 128); +INSTANTIATE_INT4MM(half, 128); +INSTANTIATE_INT4MM(float, 256); +INSTANTIATE_INT4MM(half, 256); +#if __METAL_VERSION__ >= 310 +INSTANTIATE_INT4MM(bfloat, 32); +INSTANTIATE_INT4MM(bfloat, 64); +INSTANTIATE_INT4MM(bfloat, 128); +INSTANTIATE_INT4MM(bfloat, 256); +#endif +)METAL_QUANTIZED"); + +Tensor _weight_int4pack_mm_mps(const Tensor& A, const Tensor& B, int64_t qGroupSize, const Tensor& qScaleAndZeros) { + constexpr int64_t kNTileSize = 8; + + auto M = A.size(0); + auto N = B.size(0) * kNTileSize; + auto K = A.size(1); + + TORCH_CHECK(A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat, + __func__, + " : expect A to be either 32-bit or 16-bit float tensor."); + TORCH_CHECK(A.is_contiguous(), __func__, " : expect A to be contiguous."); + TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor."); + + TORCH_CHECK(B.dtype() == kInt, __func__, " : expect B to be int32 tensor."); + TORCH_CHECK(B.is_contiguous(), __func__, " : expect B to be contiguous."); + TORCH_CHECK(B.dim() == 4, __func__, " : expect B to 4d tensor."); + + TORCH_CHECK(qGroupSize == 32 || qGroupSize == 64 || qGroupSize == 128 || qGroupSize == 256, + __func__, + ": expect qGroupSize to be 32, 64, 128 or 256, got ", + qGroupSize); + + TORCH_CHECK(qScaleAndZeros.dim() == 3 && qScaleAndZeros.size(1) == N && qScaleAndZeros.size(2) == 2, + __func__, + ": expect qScaleAndZeros to be 3d tensor with sizes [:, ", + N, + ", 2]"); + + auto C = at::empty({M, N}, A.options()); + id device = MPSDevice::getInstance()->device(); + MPSStream* mpsStream = getCurrentMPSStream(); + std::array sizes = {static_cast(M), static_cast(K), static_cast(N)}; + static bool firstCapture = false; + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { + @autoreleasepool { +#if _CAPTURE_KERNEL + auto& profiler = getMPSProfiler(); + if (profiler.isCaptureEnabled()) { + profiler.startCapture(__func__, mpsStream); + } +#endif + id computeEncoder = mpsStream->commandEncoder(); + const std::string kernel = fmt::format("int4pack_mm_{}_{}", qGroupSize, scalarToMetalTypeString(A)); + id quantizedPSO = lib.getPipelineStateForFunc(kernel); + [computeEncoder setComputePipelineState:quantizedPSO]; + mtl_setBuffer(computeEncoder, A, 0); + mtl_setBuffer(computeEncoder, B, 1); + mtl_setBuffer(computeEncoder, qScaleAndZeros, 2); + mtl_setBuffer(computeEncoder, C, 3); + [computeEncoder setBytes:sizes.data() length:sizeof(uint32_t) * sizes.size() atIndex:4]; + mtl_dispatch1DJob(computeEncoder, quantizedPSO, C.numel()); +#if _CAPTURE_KERNEL + if (profiler.isCapturing()) { + profiler.stopCapture(mpsStream); + } +#endif + } + }); + return C; +} + +Tensor _weight_int8pack_mm_mps(const Tensor& A, const Tensor& B, const Tensor& scales) { + auto M = A.size(0); + auto N = B.size(0); + auto K = A.size(1); + + TORCH_CHECK(A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat, 
+ __func__, + " : expect A to be either 32-bit or 16-bit float tensor."); + TORCH_CHECK(A.is_contiguous(), __func__, " : expect A to be contiguous."); + TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor."); + + TORCH_CHECK(B.dtype() == kChar, __func__, " : expect B to be int8 tensor."); + TORCH_CHECK(B.is_contiguous(), __func__, " : expect B to be contiguous."); + TORCH_CHECK(B.size(1) == K, __func__, " : expect B.size(1) == ", K); + + TORCH_CHECK(scales.dim() == 1 && scales.size(0) == N, __func__, " : expect scales to be 1d tensor with size ", N); + + auto C = at::empty({M, N}, A.options()); + + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *ATensor = nil, *BTensor = nil, *scalesTensor = nil; + MPSGraphTensor* outputTensor = nil; + }; + @autoreleasepool { + std::string key = __func__ + getTensorsStringKey({A, B, scales}); + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + newCachedGraph->ATensor = mpsGraphRankedPlaceHolder(mpsGraph, A); + newCachedGraph->BTensor = mpsGraphRankedPlaceHolder(mpsGraph, B); + newCachedGraph->scalesTensor = mpsGraphRankedPlaceHolder(mpsGraph, scales); + auto castB = castMPSTensor(mpsGraph, newCachedGraph->BTensor, getMPSScalarType(A)); + auto transposedB = [mpsGraph transposeTensor:castB dimension:-1 withDimension:-2 name:nil]; + auto mmTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:newCachedGraph->ATensor + secondaryTensor:transposedB + name:nil]; + newCachedGraph->outputTensor = [mpsGraph multiplicationWithPrimaryTensor:mmTensor + secondaryTensor:newCachedGraph->scalesTensor + name:nil]; + }); + auto APlaceholder = Placeholder(cachedGraph->ATensor, A); + auto BPlaceholder = Placeholder(cachedGraph->BTensor, B); + auto scalesPlaceholder = Placeholder(cachedGraph->scalesTensor, scales); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor, C); + runMPSGraph(getCurrentMPSStream(), + cachedGraph->graph(), + dictionaryFromPlaceholders(APlaceholder, BPlaceholder, scalesPlaceholder), + outputPlaceholder); + } + + return C; +} + +} // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/RangeFactories.mm b/aten/src/ATen/native/mps/operations/RangeFactories.mm index 682679aa2045c..102c54c251dba 100644 --- a/aten/src/ATen/native/mps/operations/RangeFactories.mm +++ b/aten/src/ATen/native/mps/operations/RangeFactories.mm @@ -121,9 +121,7 @@ MPSScalar stepScalar = getMPSScalar(step, result.scalar_type()); feeds[cachedGraph->multiplyTensor] = getMPSGraphTensorFromScalar(stream, stepScalar); - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } if (!is_contiguous) { @@ -168,7 +166,7 @@ if (numel != size) { result.resize_({size}); } - bool is_contiguous = result.is_contiguous(); + bool is_contiguous = !mps::needsGather(result); Tensor r = !is_contiguous ? 
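The _weight_int8pack_mm_mps graph above casts the int8 weights to A's dtype, multiplies A by the transposed weight matrix, and applies one scale per output column. A hypothetical CPU reference of that computation:

#include <cstdint>
#include <vector>

// C = (A @ B^T) * scales, with B stored as int8 rows of length K and one scale per column of C.
void int8_mm_reference(const std::vector<float>& A,      // M x K
                       const std::vector<int8_t>& B,     // N x K
                       const std::vector<float>& scales, // N
                       std::vector<float>& C,            // M x N
                       int M, int N, int K) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.0f;
      for (int k = 0; k < K; ++k) {
        acc += A[m * K + k] * float(B[n * K + k]); // cast-then-matmul, as the graph does
      }
      C[m * N + n] = acc * scales[n]; // per-column scale applied after the matmul
    }
  }
}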
at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT) : result; using namespace mps; auto cache_ = MPSGraphCache::getInstance(); @@ -190,9 +188,7 @@ MPSScalar stepScalar = getMPSScalar(step, result.scalar_type()); feeds[cachedGraph->multiplyTensor] = getMPSGraphTensorFromScalar(stream, stepScalar); - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } if (!is_contiguous) { @@ -259,9 +255,7 @@ MPSScalar multiplyScalar = getMPSScalar(multiply, ScalarType::Float); feeds[cachedGraph->multiplyTensor] = getMPSGraphTensorFromScalar(stream, multiplyScalar); - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } if (!result.is_contiguous()) { diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm index 0fc0ff8f2859d..416c83f0d3b3e 100644 --- a/aten/src/ATen/native/mps/operations/ReduceOps.mm +++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm @@ -30,10 +30,12 @@ #include #include #include +#include #include #include #include #include +#include #include #endif @@ -280,13 +282,8 @@ static void reduction_out_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t, mpsShape); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -321,8 +318,8 @@ static void impl_func_norm_mps(const Tensor& input_tensor, auto reciprocal_p = 1 / p; bool pIsZero = (p == 0.0); - bool pIsPosInf = (p == numeric_limits::infinity()); - bool pIsNegInf = (p == -numeric_limits::infinity()); + bool pIsPosInf = (p == std::numeric_limits::infinity()); + bool pIsNegInf = (p == -std::numeric_limits::infinity()); int64_t num_input_dims = input_shape.size(); int64_t num_reduce_dims = dim.size(); @@ -434,10 +431,7 @@ static void impl_func_norm_mps(const Tensor& input_tensor, feeds[otherPlaceholder.getMPSGraphTensor()] = otherPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -606,13 +600,8 @@ static Tensor std_var_common_impl_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = 
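Many of the hunks that follow replace hand-written NSDictionary feed/result literals with dictionaryFromPlaceholders. The idea is a variadic helper that turns any number of placeholders into one dictionary; a minimal C++ sketch of that shape, with types and member names that are illustrative only and not the real Placeholder API:

#include <map>
#include <string>

struct FakePlaceholder {
  std::string key;
  int value;
};

// Build a key -> value map from any number of placeholder-like objects (C++17 fold expression).
template <typename... Ps>
std::map<std::string, int> dictionaryFrom(const Ps&... ps) {
  std::map<std::string, int> dict;
  (dict.emplace(ps.key, ps.value), ...);
  return dict;
}

// Usage sketch: auto feeds = dictionaryFrom(inputPlaceholder);
//               auto results = dictionaryFrom(outputPlaceholder, indicesPlaceholder);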
dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return output_t; @@ -661,14 +650,8 @@ static Tensor min_max_mps_impl(const Tensor& input_t, MPSReductionType reduction auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, @[ @1 ]); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } return output_t; @@ -757,15 +740,8 @@ static void min_max_out_mps(const Tensor& input_t, auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); auto indicesPlaceholder = Placeholder(cachedGraph->indicesTensor_, indices_t, apparent_out_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() - }; - + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + auto results = dictionaryFromPlaceholders(outputPlaceholder, indicesPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } } @@ -920,14 +896,8 @@ static void argmax_argmin_out_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t, apparent_in_shape); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1266,15 +1236,8 @@ Tensor std_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - }; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1316,15 +1279,8 @@ Tensor std_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - }; - - runMPSGraph(stream, cachedGraph->graph(), 
feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1375,15 +1331,8 @@ Tensor std_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - }; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1422,15 +1371,8 @@ Tensor std_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - }; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1489,7 +1431,7 @@ Tensor min_mps(const Tensor& input_t) { Tensor median_mps(const Tensor& input_t) { if (!is_macos_13_or_newer()) { TORCH_WARN_ONCE("MPS: median op is supported natively starting from macOS 13.0. ", - "Falling back on CPU. This may have performace implications."); + "Falling back on CPU. 
This may have performance implications."); return at::median(input_t.to("cpu")); } @@ -1536,14 +1478,8 @@ Tensor median_mps(const Tensor& input_t) { auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, @[ @1 ]); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } return output_t; @@ -1626,15 +1562,8 @@ static void median_out_mps(const Tensor& input_t, auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); auto indicesPlaceholder = Placeholder(cachedGraph->indicesTensor_, indices_t, apparent_out_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() - }; - + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + auto results = dictionaryFromPlaceholders(outputPlaceholder, indicesPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } } @@ -1717,7 +1646,7 @@ static void median_out_mps(const Tensor& input_t, if (!is_macos_13_or_newer()) { TORCH_WARN_ONCE("MPS: median op is supported natively starting from macOS 13.0.", - "Falling back on CPU. This may have performace implications."); + "Falling back on CPU. 
This may have performance implications."); return median_from_cpu(input_t.to("cpu"), dim, keepdim, @@ -1732,4 +1661,26 @@ static void median_out_mps(const Tensor& input_t, return std::tuple{values, indices}; } +std::tuple std_mean_mps(const Tensor& self, + at::OptionalIntArrayRef dim, + const c10::optional& correction, + bool keepdim) { + // TODO: Refactor it into a proper std_var_mean composite function + auto std = std_mps(self, dim, correction, keepdim); + auto mean = at::empty(std.sizes(), self.scalar_type(), c10::nullopt, kMPS, c10::nullopt, MemoryFormat::Contiguous); + mps::reduction_out_mps(self, dim, keepdim, c10::nullopt, mean, mps::MPSReductionType::MEAN, "mean_out_mps"); + return {std, mean}; +} + +std::tuple var_mean_mps(const Tensor& self, + at::OptionalIntArrayRef dim, + const c10::optional& correction, + bool keepdim) { + // TODO: Refactor it into a proper std_var_mean composite function + auto var = var_mps(self, dim, correction, keepdim); + auto mean = at::empty(var.sizes(), self.scalar_type(), c10::nullopt, kMPS, c10::nullopt, MemoryFormat::Contiguous); + mps::reduction_out_mps(self, dim, keepdim, c10::nullopt, mean, mps::MPSReductionType::MEAN, "mean_out_mps"); + return {var, mean}; +} + } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/RenormKernel.mm b/aten/src/ATen/native/mps/operations/RenormKernel.mm index c72e9560abba0..c4655baa90eee 100644 --- a/aten/src/ATen/native/mps/operations/RenormKernel.mm +++ b/aten/src/ATen/native/mps/operations/RenormKernel.mm @@ -15,7 +15,9 @@ namespace at::native { namespace { -static const char* METAL_RENORM = R"RENORM_METAL( +using namespace mps; + +static MetalShaderLibrary lib(R"RENORM_METAL( #include using namespace metal; @@ -41,48 +43,7 @@ kernel void renorm(constant T* norm [[buffer(0)]], REGISTER_RENORM_OP(float); REGISTER_RENORM_OP(half); -)RENORM_METAL"; - -using namespace mps; - -static id compileRenormLibrary(id device, const std::string& key) { - static std::unordered_map> libMap; - auto it = libMap.find(key); - if (it != libMap.end()) { - return it->second; - } - - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; -stringWithCString: - id renormLibrary = [device newLibraryWithSource:[NSString stringWithUTF8String:METAL_RENORM] - options:options - error:&error]; - TORCH_CHECK( - renormLibrary, "Failed to to create renorm mps kernel library, error: ", error.localizedDescription.UTF8String); - - libMap[key] = renormLibrary; - return renormLibrary; -} - -static id renormPipelineState(id device, const std::string& key) { - static std::unordered_map> psoCache; - id pso = psoCache[key]; - if (pso) { - return pso; - } - - NSError* error = nil; - id renormLib = compileRenormLibrary(device, key); - id renormFunc = [renormLib newFunctionWithName:[NSString stringWithUTF8String:key.c_str()]]; - TORCH_CHECK(renormFunc, "Failed to create function state object for: ", key); - pso = [device newComputePipelineStateWithFunction:renormFunc error:&error]; - TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]); - - psoCache[key] = pso; - return pso; -} +)RENORM_METAL"); void renorm_out_mps(const Tensor& self, const Scalar& p, int64_t dim, const Scalar& maxnorm, const Tensor& out) { auto self_sizes = self.sizes(); @@ -100,10 +61,10 @@ void renorm_out_mps(const Tensor& self, const Scalar& p, int64_t dim, const Scal id normBuffer = getMTLBufferStorage(norm); id factorBuffer = 
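The new std_mean_mps and var_mean_mps above simply pair the existing std/var reduction with a mean reduction over the same dims. A small CPU sketch of the math they are expected to agree with, assuming the usual correction (Bessel) semantics, for a flat input:

#include <cmath>
#include <utility>
#include <vector>

// Returns {variance, mean}; correction = 1 is the sample variance, 0 the population variance.
std::pair<double, double> var_mean_ref(const std::vector<double>& x, int correction = 1) {
  const double n = static_cast<double>(x.size());
  double mean = 0.0;
  for (double v : x) mean += v;
  mean /= n;
  double var = 0.0;
  for (double v : x) var += (v - mean) * (v - mean);
  var /= (n - correction);
  return {var, mean};
}

// std_mean is then {std::sqrt(var), mean} for the same reduction dims.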
getMTLBufferStorage(factor); - string key = "renorm_" + scalarToMetalTypeString(self.scalar_type()); + string key = "renorm_" + scalarToMetalTypeString(self); MPSStream* mpsStream = getCurrentMPSStream(); id computeEncoder = mpsStream->commandEncoder(); - id renormPSO = renormPipelineState(device, key); + id renormPSO = lib.getPipelineStateForFunc(key); dispatch_sync(mpsStream->queue(), ^() { @autoreleasepool { diff --git a/aten/src/ATen/native/mps/operations/Repeat.mm b/aten/src/ATen/native/mps/operations/Repeat.mm index ac84e98f491f5..db7722b0e63b4 100644 --- a/aten/src/ATen/native/mps/operations/Repeat.mm +++ b/aten/src/ATen/native/mps/operations/Repeat.mm @@ -10,10 +10,6 @@ #include #include -#ifdef __OBJC__ -#include -#endif - namespace at::native { Tensor permute_mps(const Tensor& self, IntArrayRef dims) { @@ -90,18 +86,14 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) { Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result, /*mpsShape=*/nil, /*gatherTensorData*/ false, outputDataType); - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; } -static const char* METAL_REPEAT_INTERLEAVE = R"METAL_REPEAT( +static mps::MetalShaderLibrary lib(R"METAL_REPEAT( kernel void repeat_interleave(constant {0} * repeat_ptr [[buffer(0)]], constant int64_t * cumsum_ptr [[buffer(1)]], device {0} * result_ptr [[buffer(2)]], @@ -114,49 +106,12 @@ kernel void repeat_interleave(constant {0} * repeat_ptr [[buf result_ptr[j] = tid; }} }} -)METAL_REPEAT"; - -static id compileRepeatInterleaveLib(id device, const std::string& t1) { - auto key = t1; - static std::unordered_map> libMap; - auto it = libMap.find(key); - if (it != libMap.end()) { - return it->second; - } - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - auto rc = - [device newLibraryWithSource:[NSString stringWithUTF8String:fmt::format(METAL_REPEAT_INTERLEAVE, t1).c_str()] - options:options - error:&error]; - TORCH_CHECK(rc != nil && error == nil, "Failed to compile library: ", [[error localizedDescription] UTF8String]); - libMap[key] = rc; - return rc; -} - -static id getPipelineState(id device, const std::string& t1) { - static std::string kernel = "repeat_interleave"; - auto key = kernel + t1; - static std::unordered_map> cplMap; - auto it = cplMap.find(key); - if (it != cplMap.end()) { - return it->second; - } - NSError* error = nil; - auto library = compileRepeatInterleaveLib(device, t1); - id func = [library newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]]; - TORCH_CHECK(func != nil, "Can't get kernel ", kernel); - auto rc = [device newComputePipelineStateWithFunction:func error:&error]; - TORCH_CHECK( - rc != nil && error == nil, "Failed to construct pipeline state: ", [[error localizedDescription] UTF8String]); - cplMap[key] = rc; - return rc; -} +)METAL_REPEAT", + 1); template -void computeRepeatIndices(index_t* repeat_ptr, - int64_t* cumsum_ptr, +void computeRepeatIndices(const index_t* repeat_ptr, + const int64_t* cumsum_ptr, index_t* result_ptr, int64_t size, int64_t result_size) { @@ -178,7 +133,7 @@ void 
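The repeat_interleave kernel retained above assigns one thread per input element: thread tid writes its own index into result[cumsum[tid] - repeat[tid], cumsum[tid]). A serial C++ equivalent of that index expansion:

#include <cstdint>
#include <vector>

std::vector<int64_t> repeat_interleave_indices(const std::vector<int64_t>& repeats) {
  std::vector<int64_t> cumsum(repeats.size());
  int64_t running = 0;
  for (size_t i = 0; i < repeats.size(); ++i) {
    running += repeats[i];
    cumsum[i] = running;
  }
  std::vector<int64_t> result(running);
  for (size_t tid = 0; tid < repeats.size(); ++tid) { // one GPU thread per input element
    const int64_t end = cumsum[tid];
    const int64_t start = end - repeats[tid];
    for (int64_t j = start; j < end; ++j) {
      result[j] = static_cast<int64_t>(tid);
    }
  }
  return result;
}
// e.g. repeats = {2, 0, 3}  ->  result = {0, 0, 2, 2, 2}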
computeRepeatIndices(index_t* repeat_ptr, dispatch_sync(mpsStream->queue(), ^() { @autoreleasepool { id computeEncoder = mpsStream->commandEncoder(); - id pipelineState = getPipelineState(MPSDevice::getInstance()->device(), scalar_type); + id pipelineState = lib.getPipelineStateForFunc("repeat_interleave", {scalar_type}); // this function call is a no-op if MPS Profiler is not enabled getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false); diff --git a/aten/src/ATen/native/mps/operations/ScatterGather.mm b/aten/src/ATen/native/mps/operations/ScatterGather.mm index 8496a16506f87..a3fc5f690754c 100644 --- a/aten/src/ATen/native/mps/operations/ScatterGather.mm +++ b/aten/src/ATen/native/mps/operations/ScatterGather.mm @@ -102,14 +102,8 @@ Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index, index_shape); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, nullptr, false, output_type); - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, indexPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -299,15 +293,8 @@ static void scatter_mps_general(const Tensor& self_arg, Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index, index_shape); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - srcPlaceholder.getMPSGraphTensor() : srcPlaceholder.getMPSGraphTensorData(), - indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, srcPlaceholder, indexPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } diff --git a/aten/src/ATen/native/mps/operations/Shape.mm b/aten/src/ATen/native/mps/operations/Shape.mm index e6148904aba44..135041be1f41e 100644 --- a/aten/src/ATen/native/mps/operations/Shape.mm +++ b/aten/src/ATen/native/mps/operations/Shape.mm @@ -198,13 +198,8 @@ static void check_shape_except_dim(const Tensor& first, const Tensor& second, in Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values); Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices); // Create dictionary of inputs and outputs - NSDictionary* feeds = nil; - feeds = @{inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = @{ - valuesPlaceholder.getMPSGraphTensor() : valuesPlaceholder.getMPSGraphTensorData(), - indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() - }; - + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + auto results = dictionaryFromPlaceholders(valuesPlaceholder, indicesPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } } @@ -325,8 +320,16 
@@ static void check_shape_except_dim(const Tensor& first, const Tensor& second, in }; @autoreleasepool { - string key = "cat_out_mps:" + to_string(dimension) + getTensorsStringKey(input_tensors, /*short_dtype*/ true) + - ":" + (memory_format == MemoryFormat::ChannelsLast ? "NHWC" : "NCHW"); + string key = + "cat_out_mps:" + to_string(dimension) + ":" + (memory_format == MemoryFormat::ChannelsLast ? "NHWC" : "NCHW"); + if (!all_same_dtype) { + key += getTensorsStringKey(input_tensors, true, all_same_sizes_and_stride); + } else { + key += ":" + getMPSTypeString(input_tensors[0].scalar_type(), true) + ":" + to_string(inputs.size()); + } + for (auto idx : skipped_tensor_indices) { + key += "," + std::to_string(idx); + } auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { auto len_tensor_array = inputs.size() - skipped_tensor_indices.size(); @@ -339,8 +342,7 @@ static void check_shape_except_dim(const Tensor& first, const Tensor& second, in if (tensor.scalar_type() == kBool) { scalar_type = MPSDataTypeInt8; } - newCachedGraph->inputTensors_[idx] = - mpsGraphRankedPlaceHolder(mpsGraph, scalar_type, getMPSShape(tensor, MemoryFormat::Contiguous)); + newCachedGraph->inputTensors_[idx] = mpsGraphUnrankedPlaceHolder(mpsGraph, scalar_type); if (tensor.scalar_type() != out_dtype) { castInputTensors[idx] = [mpsGraph castTensor:newCachedGraph->inputTensors_[idx] toType:getMPSDataType(out_dtype) @@ -369,11 +371,7 @@ static void check_shape_except_dim(const Tensor& first, const Tensor& second, in if (tensor.scalar_type() == kBool) { scalar_type = MPSDataTypeInt8; } - inputPlaceholders.emplace_back(cachedGraph->inputTensors_[t_idx], - tensor, - getMPSShape(tensor, MemoryFormat::Contiguous), - /*gatherTensorData*/ true, - scalar_type); + inputPlaceholders.emplace_back(cachedGraph->inputTensors_[t_idx], tensor, nullptr, true, scalar_type); t_idx++; } i++; @@ -390,10 +388,7 @@ static void check_shape_except_dim(const Tensor& first, const Tensor& second, in for (auto& inputPlaceholder : inputPlaceholders) { feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } diff --git a/aten/src/ATen/native/mps/operations/SoftMax.mm b/aten/src/ATen/native/mps/operations/SoftMax.mm index 92531164fbcff..4687ac6b96a1a 100644 --- a/aten/src/ATen/native/mps/operations/SoftMax.mm +++ b/aten/src/ATen/native/mps/operations/SoftMax.mm @@ -92,10 +92,11 @@ static void get_shapes(MPSShape* input_shape_readonly, NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; - string key = "softmax_mps_out:" + mem_format_key + ":" + getMPSTypeString(input) + ":" + [ns_shape_key UTF8String] + + string key = "softmax_mps_out" + getTensorsStringKey(input, true, /*exclude_shape*/ true) + ":" + mem_format_key + ":" + std::to_string(dim_); + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input), input_shape); + MPSGraphTensor* inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type())); // passing selector of softMaxWithTensor on the mpsGraph object MPSGraphTensor* outputTensor = [mpsGraph 
softMaxWithTensor:inputTensor axis:(NSInteger)dim_ name:nil]; @@ -122,12 +123,8 @@ static void get_shapes(MPSShape* input_shape_readonly, // This must be the Contiguous shape Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = - @{inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -186,14 +183,8 @@ static void get_shapes(MPSShape* input_shape_readonly, Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad, grad_shape); Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); - NSDictionary* feeds = @{ - softmaxPlaceholder.getMPSGraphTensor() : softmaxPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(softmaxPlaceholder, gradOutputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } } diff --git a/aten/src/ATen/native/mps/operations/Sort.mm b/aten/src/ATen/native/mps/operations/Sort.mm index bb12aa657c735..e3ee85cfe230e 100644 --- a/aten/src/ATen/native/mps/operations/Sort.mm +++ b/aten/src/ATen/native/mps/operations/Sort.mm @@ -89,12 +89,8 @@ Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values); Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices); // Create dictionary of inputs and outputs - NSDictionary* feeds = nil; - feeds = @{inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = @{ - valuesPlaceholder.getMPSGraphTensor() : valuesPlaceholder.getMPSGraphTensorData(), - indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() - }; + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + auto results = dictionaryFromPlaceholders(valuesPlaceholder, indicesPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } diff --git a/aten/src/ATen/native/mps/operations/SummaryOps.mm b/aten/src/ATen/native/mps/operations/SummaryOps.mm index 5c65fb3d0a089..34f7fbeae50a2 100644 --- a/aten/src/ATen/native/mps/operations/SummaryOps.mm +++ b/aten/src/ATen/native/mps/operations/SummaryOps.mm @@ -66,11 +66,8 @@ feeds[weightsPlaceholder.getMPSGraphTensor()] = weightsPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - // Run the graph - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return output; diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm index 781d13a27e78e..f378af1326a73 100644 --- a/aten/src/ATen/native/mps/operations/TensorCompare.mm +++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm @@ -12,7 +12,9 @@ #include #include #include +#include #include +#include #include #endif @@ -30,41 
+32,53 @@ static void clamp_mps_graph(CachedGraph* cachedGraph, const Tensor& min_tensor, const Tensor& max_tensor) { auto input_dtype = input_tensor.scalar_type(); - auto min_dtype = input_dtype; - auto max_dtype = input_dtype; - if (cachedGraph->minTensor) { - min_dtype = min_tensor.scalar_type(); - } - if (cachedGraph->maxTensor) { - max_dtype = max_tensor.scalar_type(); - } + auto min_dtype = cachedGraph->minTensor ? min_tensor.scalar_type() : input_dtype; + auto max_dtype = cachedGraph->maxTensor ? max_tensor.scalar_type() : input_dtype; MPSGraph* mpsGraph = cachedGraph->graph(); cachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_tensor); - MPSGraphTensor* minTensor = cachedGraph->minTensor; - MPSGraphTensor* maxTensor = cachedGraph->maxTensor; + auto minTensor = cachedGraph->minTensor; + auto maxTensor = cachedGraph->maxTensor; + if (input_dtype != min_dtype) { minTensor = castMPSTensor(mpsGraph, cachedGraph->minTensor, input_dtype); } if (input_dtype != max_dtype) { maxTensor = castMPSTensor(mpsGraph, cachedGraph->maxTensor, input_dtype); } - if (cachedGraph->minTensor && cachedGraph->maxTensor) { - cachedGraph->outputTensor = [mpsGraph clampWithTensor:cachedGraph->inputTensor - minValueTensor:minTensor - maxValueTensor:maxTensor - name:nil]; - } else if (cachedGraph->maxTensor) { - cachedGraph->outputTensor = [mpsGraph minimumWithPrimaryTensor:cachedGraph->inputTensor - secondaryTensor:maxTensor - name:nil]; - } else if (cachedGraph->minTensor) { - cachedGraph->outputTensor = [mpsGraph maximumWithPrimaryTensor:cachedGraph->inputTensor - secondaryTensor:minTensor - name:nil]; + if (c10::isIntegralType(input_dtype, /*includeBool=*/true)) { + if (minTensor && maxTensor) { + cachedGraph->outputTensor = [mpsGraph clampWithTensor:cachedGraph->inputTensor + minValueTensor:minTensor + maxValueTensor:maxTensor + name:nil]; + } else if (maxTensor) { + cachedGraph->outputTensor = [mpsGraph minimumWithPrimaryTensor:cachedGraph->inputTensor + secondaryTensor:maxTensor + name:nil]; + } else if (minTensor) { + cachedGraph->outputTensor = [mpsGraph maximumWithPrimaryTensor:cachedGraph->inputTensor + secondaryTensor:minTensor + name:nil]; + } + return; + } + // clampWithTensor doesn't propagate NaN through so simulate it as composition of + // maximumWithNaNPropagationWithPrimaryTensor and minimumWithNaNPropagationWithPrimaryTensor + auto outputTensor = cachedGraph->inputTensor; + if (minTensor) { + outputTensor = [mpsGraph maximumWithNaNPropagationWithPrimaryTensor:outputTensor + secondaryTensor:minTensor + name:nil]; } + if (maxTensor) { + outputTensor = [mpsGraph minimumWithNaNPropagationWithPrimaryTensor:outputTensor + secondaryTensor:maxTensor + name:nil]; + } + cachedGraph->outputTensor = outputTensor; } static void check_min_max_dims(const OptionalTensorRef clamp_opt, const Tensor& input_t, string op_name) { @@ -198,10 +212,7 @@ static void clamp_tensor_out_mps(const Tensor& input_t, feeds[maxPlaceholder.getMPSGraphTensor()] = maxPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -254,13 +265,70 @@ static void clamp_scalar_out_mps(const Tensor& input_t, auto outputPlaceholder = Placeholder(cachedGraph->outputTensor, output_t, /*mpsShape=*/nil, /*gatherTensorData=*/false); - NSDictionary* feeds = @{ - 
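The clamp rework above keeps clampWithTensor / minimum / maximum only for integral inputs; for floating point it composes the NaN-propagating maximum and minimum so that NaN inputs survive clamping. The intended scalar semantics, sketched:

#include <cmath>

// NaN inputs stay NaN; ordinary values are clamped to [lo, hi].
float clamp_nan_propagating(float x, float lo, float hi) {
  if (std::isnan(x)) {
    return x;                  // propagate NaN instead of clamping it to a bound
  }
  float y = std::fmax(x, lo);  // lower bound first (maximum)
  y = std::fmin(y, hi);        // then upper bound (minimum)
  return y;
}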
inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); + } +} - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); +static void isin_Tensor_Tensor_out_mps(const Tensor& elements, + const Tensor& test_elements, + bool assume_unique, + bool invert, + const Tensor& out, + string op_name) { + TORCH_CHECK(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS), + "isin_Tensor_Tensor_out supported on MPS from MacOs_14_0 onwards"); + if (elements.numel() == 0) { + return; + } + + if (test_elements.numel() == 0) { + if (invert) { + auto ones = ones_like(out); + out.copy_(ones); + } else { + auto zeros = zeros_like(out); + out.copy_(zeros); + } + return; + } + + TORCH_CHECK(elements.is_mps() && test_elements.is_mps()); + TORCH_CHECK(elements.dtype() == test_elements.dtype()); + + @autoreleasepool { + string key = + op_name + getTensorsStringKey({elements}) + getTensorsStringKey({test_elements}) + std::to_string(invert); + + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + MPSGraphTensor* inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(elements.scalar_type())); + MPSGraphTensor* otherTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(test_elements.scalar_type())); + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->otherTensor_ = otherTensor; + + MPSShape* outputShape = getMPSShape(out); + + MPSGraphTensor* input_flattened = [mpsGraph reshapeTensor:inputTensor withShape:@[ @-1, @1 ] name:nil]; + MPSGraphTensor* other_flattened = [mpsGraph reshapeTensor:otherTensor withShape:@[ @1, @-1 ] name:nil]; + MPSGraphTensor* isInTensor = [mpsGraph equalWithPrimaryTensor:input_flattened + secondaryTensor:other_flattened + name:nil]; + MPSGraphTensor* output = [mpsGraph reductionOrWithTensor:isInTensor axis:1 name:nil]; + output = [mpsGraph reshapeTensor:output withShape:outputShape name:nil]; + + if (invert) { + output = [mpsGraph notWithTensor:output name:nil]; + } + newCachedGraph->outputTensor_ = output; + }); + + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, elements); + auto otherPlaceholder = Placeholder(cachedGraph->otherTensor_, test_elements); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, otherPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -297,7 +365,16 @@ static void clamp_scalar_out_mps(const Tensor& input_t, mps::clamp_scalar_out_mps(input_t, at::OptionalScalarRef(), max, output_t, __func__); } -Tensor& where_self_out_mps(const Tensor& condition, const Tensor& self, const Tensor& other, Tensor& out) { +TORCH_IMPL_FUNC(isin_Tensor_Tensor_out_mps) +(const Tensor& elements, const Tensor& test_elements, bool assume_unique, bool invert, const Tensor& out) { + mps::isin_Tensor_Tensor_out_mps(elements, test_elements, assume_unique, invert, out, __func__); +} + +static void where_kernel_mps(TensorIterator& iter) { + const auto& condition = iter.input(0); + const auto& self = iter.input(1); + const auto& other = iter.input(2); + auto& out = iter.output(0); TORCH_CHECK(condition.device() == self.device() && self.device() == other.device(), 
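The new isin_Tensor_Tensor_out_mps above reshapes elements into a column and test_elements into a row, compares them with broadcasting, then ORs along the row (optionally inverting the result). An equivalent C++ sketch:

#include <vector>

std::vector<bool> isin_ref(const std::vector<int>& elements,
                           const std::vector<int>& test_elements,
                           bool invert = false) {
  std::vector<bool> out(elements.size(), false);
  for (size_t i = 0; i < elements.size(); ++i) {
    bool found = false;
    for (int t : test_elements) {  // reductionOr over the broadcast equality
      if (elements[i] == t) {
        found = true;
        break;
      }
    }
    out[i] = invert ? !found : found;
  }
  return out;
}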
"Expected all tensors to be on the same device, but found at least two devices."); TORCH_CHECK(self.dtype() == other.dtype(), "expected scalar type ", self.dtype(), " but found ", other.dtype()); @@ -316,8 +393,9 @@ static void clamp_scalar_out_mps(const Tensor& input_t, MPSStream* stream = getCurrentMPSStream(); // Empty output - if (out.numel() == 0) - return out; + if (out.numel() == 0) { + return; + } // Derive from MPSCachedGraph struct CachedGraph : public MPSCachedGraph { @@ -372,61 +450,9 @@ static void clamp_scalar_out_mps(const Tensor& input_t, Placeholder(cachedGraph->otherTensor_, other, /*mpsShape=*/nullptr, /*gatherTensorData=*/true, otherDataType); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); - NSDictionary* feeds = @{ - conditionPlaceholder.getMPSGraphTensor() : conditionPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(conditionPlaceholder, selfPlaceholder, otherPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } - - return out; -} - -Tensor where_mps(const Tensor& condition, const Tensor& self, const Tensor& other) { - auto max_dim = std::max(condition.dim(), std::max(self.dim(), other.dim())); - - // How many leading dimensions do we broadcast across for each Tensor? - int cond_num_implicit_ones = (max_dim - condition.dim()); - int self_num_implicit_ones = (max_dim - self.dim()); - int other_num_implicit_ones = (max_dim - other.dim()); - - std::vector out_arr(max_dim); - - // Broadcasted output shape - for (int i = 0; i < max_dim; i++) { - // Use up the leading broadcast dimensions for each Tensor, then continue from the start of the "actual" shape - int64_t cond_idx = i < cond_num_implicit_ones ? 1 : (condition.size(i - cond_num_implicit_ones)); - int64_t self_idx = i < self_num_implicit_ones ? 1 : (self.size(i - self_num_implicit_ones)); - int64_t other_idx = i < other_num_implicit_ones ? 1 : (other.size(i - other_num_implicit_ones)); - - auto max_idx = std::max({cond_idx, self_idx, other_idx}); - - TORCH_CHECK(cond_idx == max_idx || cond_idx == 1 || (cond_idx == 0 && max_idx == 1), - i, - "'th index ", - cond_idx, - " of condition tensor does not match the other tensors") - TORCH_CHECK(self_idx == max_idx || self_idx == 1 || (self_idx == 0 && max_idx == 1), - i, - "'th index ", - self_idx, - " of x tensor does not match the other tensors") - TORCH_CHECK(other_idx == max_idx || other_idx == 1 || (other_idx == 0 && max_idx == 1), - i, - "'th index ", - other_idx, - " of x tensor does not match the other tensors") - - out_arr[i] = (cond_idx == 0 || self_idx == 0 || other_idx == 0) ? 
0 : max_idx; - } - - Tensor ret = at::empty( - IntArrayRef(out_arr), self.scalar_type(), c10::nullopt, kMPS, c10::nullopt, self.suggest_memory_format()); - return where_self_out_mps(condition, self, other, ret); } Tensor& nan_to_num_out_mps(const Tensor& self, @@ -520,11 +546,11 @@ Tensor where_mps(const Tensor& condition, const Tensor& self, const Tensor& othe cachedGraph->posInfReplacementTensor : getMPSGraphTensorFromScalar(stream, posInfReplacementScalar), cachedGraph->negInfReplacementTensor : getMPSGraphTensorFromScalar(stream, negInfReplacementScalar), }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; } +REGISTER_DISPATCH(where_kernel, &where_kernel_mps); + } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/TriangularOps.mm b/aten/src/ATen/native/mps/operations/TriangularOps.mm index 89eb5c3c37fa1..5fa0b22184535 100644 --- a/aten/src/ATen/native/mps/operations/TriangularOps.mm +++ b/aten/src/ATen/native/mps/operations/TriangularOps.mm @@ -55,12 +55,8 @@ Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -105,12 +101,8 @@ Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } diff --git a/aten/src/ATen/native/mps/operations/UnaryKernel.mm b/aten/src/ATen/native/mps/operations/UnaryKernel.mm index 4998d90e46871..540fc6a26cd81 100644 --- a/aten/src/ATen/native/mps/operations/UnaryKernel.mm +++ b/aten/src/ATen/native/mps/operations/UnaryKernel.mm @@ -39,45 +39,7 @@ return getMetalType(t.scalar_type()); } -static id compileUnaryOpsLibrary(id device, const std::string& t1, const std::string& t2) { - auto key = t1 + t2; - static std::unordered_map> libMap; - auto it = libMap.find(key); - if (it != libMap.end()) { - return it->second; - } - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - auto rc = - [device newLibraryWithSource:[NSString stringWithUTF8String:fmt::format(UNARY_KERNEL_TEMPLATE, t1, t2).c_str()] - options:options - error:&error]; - TORCH_CHECK(rc != nil && error == nil, "Failed to compile library: ", [[error localizedDescription] UTF8String]); - libMap[key] = rc; - return rc; -} - -static id getCPLState(id device, - const std::string& t1, - const std::string& t2, - const std::string& fname) { - auto key = t1 + t2 
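where is rerouted through TensorIterator (where_kernel_mps plus REGISTER_DISPATCH), which makes the hand-rolled broadcast-shape computation in the deleted where_mps unnecessary. A sketch of the broadcasting rule that code implemented, ignoring zero-size edge cases:

#include <algorithm>
#include <stdexcept>
#include <vector>

// Each dimension's sizes must match or be 1; missing leading dims count as size 1.
std::vector<int64_t> broadcast_shapes(const std::vector<std::vector<int64_t>>& shapes) {
  size_t ndim = 0;
  for (const auto& s : shapes) ndim = std::max(ndim, s.size());
  std::vector<int64_t> out(ndim, 1);
  for (const auto& s : shapes) {
    const size_t offset = ndim - s.size();
    for (size_t i = 0; i < s.size(); ++i) {
      const int64_t cur = s[i];
      int64_t& dst = out[offset + i];
      if (cur == dst || cur == 1) continue;
      if (dst == 1) { dst = cur; continue; }
      throw std::invalid_argument("shapes are not broadcastable");
    }
  }
  return out;
}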
+ fname; - static std::unordered_map> cplMap; - auto it = cplMap.find(key); - if (it != cplMap.end()) { - return it->second; - } - NSError* error = nil; - auto library = compileUnaryOpsLibrary(device, t1, t2); - id func = [library newFunctionWithName:[NSString stringWithUTF8String:fname.c_str()]]; - TORCH_CHECK(func != nil, "Can't get function ", fname); - auto rc = [device newComputePipelineStateWithFunction:func error:&error]; - TORCH_CHECK( - rc != nil && error == nil, "Failed to construct pipeline state: ", [[error localizedDescription] UTF8String]); - cplMap[key] = rc; - return rc; -} +static mps::MetalShaderLibrary lib(UNARY_KERNEL_TEMPLATE, 2); TORCH_IMPL_FUNC(erfinv_out_mps)(const Tensor& self, const Tensor& output_) { // handle erfinv ops using metal kernel @@ -95,9 +57,7 @@ } using namespace mps; @autoreleasepool { - id device = MPSDevice::getInstance()->device(); - id cplState = - getCPLState(device, getMetalType(outputTensor), getMetalType(self), "erfinv_mps_kernel"); + auto cplState = lib.getPipelineStateForFunc("erfinv_mps_kernel", {getMetalType(outputTensor), getMetalType(self)}); if (!self.is_contiguous()) { inputTensor = inputTensor.contiguous(); diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm index 970c0ee77e868..46709f2489e7d 100644 --- a/aten/src/ATen/native/mps/operations/UnaryOps.mm +++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm @@ -1,6 +1,7 @@ // Copyright Ā© 2022 Apple Inc. #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -35,7 +38,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -70,26 +75,13 @@ static bool is_empty_tensor(const Tensor& self) { return self.numel() == 0; } -static void unary_op(const Tensor& self, - const Tensor& output_, - std::string op_name, - UnaryOpBlock unaryBlock, - is_noop_p is_noop = is_empty_tensor) { +static void unary_op_noresize(const Tensor& self, const Tensor& output_, std::string op_name, UnaryOpBlock unaryBlock) { TORCH_CHECK(!(!is_macos_13_or_newer() && self.scalar_type() == ScalarType::Byte), "MPS support unary op with uint8 natively starting from macOS 13.0"); - if (!output_.is_same_size(self)) { - output_.resize_(self.sizes()); - } - - if (is_noop(self)) { - output_.copy_(self); - return; - } - auto output = output_; bool needsCopyToOutput = false; - if (output.storage_offset() || !output.is_contiguous()) { + if (needsGather(output)) { output = at::empty(output.sizes(), output.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); needsCopyToOutput = true; } @@ -125,11 +117,8 @@ static void unary_op(const Tensor& self, auto selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self_, /*mpsShape=*/nullptr, gatherTensorData); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, /*mpsShape=*/nullptr, false); - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); if (needsCopyToOutput) { output_.copy_(output); @@ 
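Several files in this diff delete near-identical compile-library / pipeline-state helpers in favour of a shared MetalShaderLibrary. The pattern those helpers all repeated is a string-keyed build-once cache; a generic sketch of that pattern, with illustrative names only:

#include <functional>
#include <string>
#include <unordered_map>

template <typename Artifact>
class KeyedCache {
 public:
  // Build the artifact for `key` once, memoize it, and return the cached copy afterwards.
  Artifact& getOrCreate(const std::string& key, const std::function<Artifact()>& build) {
    auto it = cache_.find(key);
    if (it != cache_.end()) {
      return it->second;  // already compiled for this dtype/function key
    }
    return cache_.emplace(key, build()).first->second;
  }

 private:
  std::unordered_map<std::string, Artifact> cache_;
};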
-137,6 +126,23 @@ static void unary_op(const Tensor& self, } } +static void unary_op(const Tensor& self, + const Tensor& output_, + std::string op_name, + UnaryOpBlock unaryBlock, + is_noop_p is_noop = is_empty_tensor) { + if (!output_.is_same_size(self)) { + output_.resize_(self.sizes()); + } + + if (is_noop(self)) { + output_.copy_(self); + return; + } + + unary_op_noresize(self, output_, op_name, unaryBlock); +} + MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { // Rounding is a no-op for integral types, and also a reasonable workaround // For MPSGraph bug on Apple Silicon, that throws `Function floorOp_i64 was not found in the library` @@ -166,6 +172,12 @@ static void unary_op(const Tensor& self, return [mpsGraph logarithmWithTensor:addedTensor name:nil]; } +static MPSGraphTensor* lengthOfComplexAsReal(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + auto squares = [mpsGraph squareWithTensor:inputTensor name:nil]; + auto sumSquares = [mpsGraph reductionSumWithTensor:squares axis:-1 name:nil]; + return [mpsGraph squareRootWithTensor:sumSquares name:nil]; +} + } // namespace mps TORCH_IMPL_FUNC(trunc_out_mps)(const Tensor& self, const Tensor& output) { @@ -224,14 +236,6 @@ static void unary_op(const Tensor& self, }); \ } -#define CREATE_MPS_UNARY_TORCH_IMPL_FUNC(func_out, func_stub) \ - Tensor& func_out(const Tensor& self, Tensor& output) { \ - mps::unary_op(self, output, #func_out, ^MPSGraphTensor*(MPSGraph * mpsGraph, MPSGraphTensor * inputTensor) { \ - return [mpsGraph func_stub##WithTensor:inputTensor name:nil]; \ - }); \ - return output; \ - } - CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(exp_out_mps, exponent) CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(exp2_out_mps, exponentBase2) CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(reciprocal_out_mps, reciprocal) @@ -255,7 +259,35 @@ static void unary_op(const Tensor& self, CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(acosh_out_mps, acosh) CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(atanh_out_mps, atanh) -CREATE_MPS_UNARY_TORCH_IMPL_FUNC(abs_out_mps, absolute) +Tensor& abs_out_mps(const Tensor& self, Tensor& output) { + using namespace mps; + + if (!output.is_same_size(self)) { + output.resize_(self.sizes()); + } + + if (self.numel() == 0) { + return output; + } + + if (supportsComplex() || !self.is_complex()) { + unary_op_noresize(self, output, "abs_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + auto rc = [mpsGraph absoluteWithTensor:inputTensor name:nil]; + if (self.is_complex()) { + rc = [mpsGraph realPartOfTensor:rc name:nil]; + } + return rc; + }); + } else { + Tensor realInput = at::view_as_real(self); + unary_op_noresize( + realInput, output, "abs_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + auto rc = lengthOfComplexAsReal(mpsGraph, inputTensor); + return [mpsGraph reshapeTensor:rc withShape:getMPSShape(output) name:nil]; + }); + } + return output; +} Tensor& logical_not_out_mps(const Tensor& self, Tensor& output) { auto bool_self = self.to(ScalarType::Bool); @@ -394,13 +426,8 @@ Tensor logit_mps(const Tensor& self, c10::optional eps) { Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - NSDictionary* results = - 
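When native complex support is unavailable, abs_out_mps above views the complex input as (real, imag) pairs and computes sqrt(re^2 + im^2) via lengthOfComplexAsReal. A CPU sketch of that fallback on interleaved storage:

#include <cmath>
#include <vector>

// Input is interleaved as re0, im0, re1, im1, ...; output holds one magnitude per complex value.
std::vector<float> complex_abs_ref(const std::vector<float>& interleaved) {
  std::vector<float> out(interleaved.size() / 2);
  for (size_t i = 0; i < out.size(); ++i) {
    const float re = interleaved[2 * i];
    const float im = interleaved[2 * i + 1];
    out[i] = std::sqrt(re * re + im * im);
  }
  return out;
}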
@{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -437,7 +464,7 @@ static void cumulative_op_impl(const Tensor& self, // issue #103810551: cumsum / cumprod are broken for int8, int16 and as chances for overflow are pretty high, cast to // int32 fixed in macOS 13.3 - bool castInputData = (isIntegralType(input.scalar_type(), false) && input.scalar_type() != ScalarType::Int && + bool castInputData = (isIntegralType(input.scalar_type(), true) && input.scalar_type() != ScalarType::Int && input.scalar_type() != ScalarType::Long); TORCH_CHECK(macOS13_3_plus || input.scalar_type() != ScalarType::Long, @@ -487,9 +514,7 @@ static void cumulative_op_impl(const Tensor& self, Tensor realOutput = at::view_as_real(output); auto complex_sgn_op = [&](MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) -> MPSGraphTensor* { - MPSGraphTensor* squares = [mpsGraph squareWithTensor:inputTensor name:nil]; - MPSGraphTensor* sumSquares = [mpsGraph reductionSumWithTensor:squares axis:-1 name:nil]; - MPSGraphTensor* norm = [mpsGraph squareRootWithTensor:sumSquares name:nil]; + MPSGraphTensor* norm = mps::lengthOfComplexAsReal(mpsGraph, inputTensor); MPSGraphTensor* zero = [mpsGraph constantWithScalar:0.0 dataType:norm.dataType]; MPSGraphTensor* isZero = [mpsGraph equalWithPrimaryTensor:norm secondaryTensor:zero name:nil]; MPSGraphTensor* sgnTensor = [mpsGraph divisionWithPrimaryTensor:inputTensor secondaryTensor:norm name:nil]; @@ -499,4 +524,20 @@ static void cumulative_op_impl(const Tensor& self, mps::unary_op(realInput, realOutput, "sgn_out_mps", complex_sgn_op); } +Tensor& conj_physical_out_mps(const Tensor& self, Tensor& result) { + TORCH_CHECK(self.is_complex()); + if (!mps::supportsComplex()) { + if (!result.is_same_size(self)) { + result.resize_(self.sizes()); + } + at::real(result).copy_(at::real(self)); + at::imag(result).copy_(at::neg(at::imag(self))); + } else { + mps::unary_op(self, result, "conj", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + return [mpsGraph conjugateWithTensor:inputTensor name:nil]; + }); + } + return result; +} + } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/Unique.mm b/aten/src/ATen/native/mps/operations/Unique.mm index cda420da5c03b..fc30c2d0b797c 100644 --- a/aten/src/ATen/native/mps/operations/Unique.mm +++ b/aten/src/ATen/native/mps/operations/Unique.mm @@ -107,7 +107,7 @@ name:nil]; MPSGraphTensor* mask = [graph castTensor:notEqualToPreviousElement toType:MPSDataTypeInt32 name:@"castMaskTensor"]; - // If comparing tensors, not scalars, check if entire tensor matches previos element using reductionOr over tensor + // If comparing tensors, not scalars, check if entire tensor matches previous element using reductionOr over tensor if (dimOpt.has_value() && [shape count] != 1) { NSMutableArray* axes = [[NSMutableArray alloc] initWithCapacity:[shape count] - 1]; for (const auto axis : c10::irange([shape count])) { @@ -186,11 +186,7 @@ @autoreleasepool { string key = getUniqueKey(self.scalar_type(), self.sizes(), return_inverse, return_counts, consecutive, dim); return LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - // Workaround for MPSShaderLibrary bug - // TODO: Remove once https://github.com/pytorch/pytorch/issues/82305 is resolved - auto 
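The conj_physical_out_mps fallback above copies the real part and negates the imaginary part when complex tensors are not natively supported. On interleaved storage that is simply:

#include <vector>

// Input is interleaved as re0, im0, re1, im1, ...
std::vector<float> conj_ref(const std::vector<float>& interleaved) {
  std::vector<float> out(interleaved.size());
  for (size_t i = 0; i + 1 < interleaved.size(); i += 2) {
    out[i] = interleaved[i];           // real part copied
    out[i + 1] = -interleaved[i + 1];  // imaginary part negated
  }
  return out;
}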
inputType = getMPSScalarType(self.scalar_type()); - newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, inputType, getMPSShape(self.sizes())); - + newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(self), getMPSShape(self)); auto outputTensors = buildUniqueGraph(self, newCachedGraph, return_inverse, return_counts, consecutive, dim); newCachedGraph->outputTensor_ = outputTensors[0]; @@ -210,9 +206,7 @@ static void runUniqueGraph(UniqueCachedGraph* uniqueGraph, bool return_inverse, bool return_counts) { Placeholder inputPlaceholder = Placeholder(uniqueGraph->inputTensor_, input); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); NSMutableDictionary* results = [NSMutableDictionary dictionary]; Placeholder outputPlaceholder = Placeholder(uniqueGraph->outputTensor_, output); @@ -285,7 +279,7 @@ static void runUniqueGraph(UniqueCachedGraph* uniqueGraph, } static std::tuple castToMPS(std::tuple out) { - return std::make_tuple(get<0>(out).to("mps"), get<1>(out).to("mps"), get<2>(out).to("mps")); + return std::make_tuple(std::get<0>(out).to("mps"), std::get<1>(out).to("mps"), std::get<2>(out).to("mps")); } std::tuple unique_consecutive_mps(const Tensor& self, @@ -294,7 +288,7 @@ static void runUniqueGraph(UniqueCachedGraph* uniqueGraph, c10::optional dim) { if (!is_macos_13_or_newer()) { TORCH_WARN_ONCE("MPS: unique_consecutive op is supported natively starting from macOS 13.0. ", - "Falling back on CPU. This may have performace implications."); + "Falling back on CPU. This may have performance implications."); return castToMPS(at::unique_consecutive(self.to("cpu"), return_inverse, return_counts, dim)); } @@ -307,7 +301,7 @@ static void runUniqueGraph(UniqueCachedGraph* uniqueGraph, const bool return_counts) { if (!is_macos_13_or_newer()) { TORCH_WARN_ONCE("MPS: unique_dim_consecutive op is supported natively starting from macOS 13.0. ", - "Falling back on CPU. This may have performace implications."); + "Falling back on CPU. This may have performance implications."); return castToMPS(at::unique_dim_consecutive(self.to("cpu"), dim, return_inverse, return_counts)); } @@ -320,7 +314,7 @@ static void runUniqueGraph(UniqueCachedGraph* uniqueGraph, const bool return_counts) { if (!is_macos_13_or_newer()) { TORCH_WARN_ONCE("MPS: _unique2 op is supported natively starting from macOS 13.0. ", - "Falling back on CPU. This may have performace implications."); + "Falling back on CPU. 
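The unique graph above marks positions where an element differs from its predecessor (reducing with OR across the non-unique dims when a dim is given); a running sum of that mask yields consecutive group ids. A serial sketch for the 1-D case:

#include <vector>

std::vector<int> unique_consecutive_groups(const std::vector<int>& x) {
  std::vector<int> group(x.size(), 0);
  int id = 0;
  for (size_t i = 1; i < x.size(); ++i) {
    if (x[i] != x[i - 1]) ++id;  // a new unique run starts here
    group[i] = id;
  }
  return group;  // e.g. {1,1,2,2,2,1} -> {0,0,1,1,1,2}
}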
This may have performance implications."); return castToMPS(at::_unique2(self.to("cpu"), sorted, return_inverse, return_counts)); } diff --git a/aten/src/ATen/native/mps/operations/UpSample.mm b/aten/src/ATen/native/mps/operations/UpSample.mm index 64fe89b7f539d..f4973f6000156 100644 --- a/aten/src/ATen/native/mps/operations/UpSample.mm +++ b/aten/src/ATen/native/mps/operations/UpSample.mm @@ -20,6 +20,10 @@ #include #include #include +#include +#include +#include +#include #include #include #include @@ -36,9 +40,9 @@ // supported resize_mode: 'nearest' | 'bilinear' | 'nearest-exact' static void upsample_out_template(const Tensor& input, IntArrayRef output_size, - c10::optional input_size_opt, // only used for backward pass - c10::optional scale_h_opt, - c10::optional scale_w_opt, + std::optional input_size_opt, // only used for backward pass + std::optional scale_h_opt, + std::optional scale_w_opt, const Tensor& output, bool align_corners, const c10::string_view resize_mode_str) { @@ -225,9 +229,7 @@ static void upsample_out_template(const Tensor& input, inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), cachedGraph->outputSizeTensor : sizeTensorData, }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); if (out.has_storage()) { output.copy_(out); @@ -237,7 +239,7 @@ static void upsample_out_template(const Tensor& input, } // namespace mps -static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10::optional scale) { +static bool check_mps_compatibility(const c10::string_view resize_mode_str, std::optional scale) { static const bool is_macOS_13_0_or_newer = is_macos_13_or_newer(); if (!is_macOS_13_0_or_newer) { // passing scale factors to MPS's resize APIs is not supported on macOS < 13 @@ -260,7 +262,7 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: } TORCH_IMPL_FUNC(upsample_nearest1d_out_mps) -(const Tensor& input, IntArrayRef output_size, c10::optional scale, const Tensor& output) { +(const Tensor& input, IntArrayRef output_size, std::optional scale, const Tensor& output) { if (check_mps_compatibility("nearest", scale)) { mps::upsample_out_template(input, output_size, c10::nullopt, c10::nullopt, scale, output, false, "nearest"); } else { @@ -272,7 +274,7 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: (const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scale, + std::optional scale, const Tensor& grad_input) { if (check_mps_compatibility("nearest", scale)) { mps::upsample_out_template(grad_output, output_size, input_size, c10::nullopt, scale, grad_input, false, "nearest"); @@ -282,7 +284,7 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: } TORCH_IMPL_FUNC(_upsample_nearest_exact1d_out_mps) -(const Tensor& input, IntArrayRef output_size, c10::optional scale, const Tensor& output) { +(const Tensor& input, IntArrayRef output_size, std::optional scale, const Tensor& output) { if (check_mps_compatibility("nearest-exact", scale)) { mps::upsample_out_template(input, output_size, c10::nullopt, c10::nullopt, scale, output, false, "nearest-exact"); } else { @@ -294,7 +296,7 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: (const Tensor& grad_output, IntArrayRef output_size, 
IntArrayRef input_size, - c10::optional scale, + std::optional scale, const Tensor& grad_input) { if (check_mps_compatibility("nearest-exact", scale)) { mps::upsample_out_template( @@ -307,8 +309,8 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: TORCH_IMPL_FUNC(upsample_nearest2d_out_mps) (const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { if (check_mps_compatibility("nearest", scales_w)) { mps::upsample_out_template(input, output_size, c10::nullopt, scales_h, scales_w, output, false, "nearest"); @@ -321,8 +323,8 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: (const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { if (check_mps_compatibility("nearest", scales_w)) { mps::upsample_out_template(grad_output, output_size, input_size, scales_h, scales_w, grad_input, false, "nearest"); @@ -335,8 +337,8 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: TORCH_IMPL_FUNC(_upsample_nearest_exact2d_out_mps) (const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { if (check_mps_compatibility("nearest-exact", scales_w)) { mps::upsample_out_template(input, output_size, c10::nullopt, scales_h, scales_w, output, false, "nearest-exact"); @@ -349,8 +351,8 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: (const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { if (check_mps_compatibility("nearest-exact", scales_w)) { mps::upsample_out_template( @@ -361,12 +363,38 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: } } +TORCH_IMPL_FUNC(upsample_linear1d_out_mps) +(const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scale, const Tensor& output) { + if (check_mps_compatibility("bilinear", scale)) { + mps::upsample_out_template( + input, output_size, c10::nullopt, c10::nullopt, scale, output, align_corners, "bilinear"); + } else { + output.copy_(at::upsample_linear1d(input.to("cpu"), output_size, align_corners, scale)); + } +} + +TORCH_IMPL_FUNC(upsample_linear1d_backward_out_mps) +(const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + std::optional scale, + const Tensor& grad_input) { + if (check_mps_compatibility("bilinear", scale)) { + mps::upsample_out_template( + grad_output, output_size, input_size, c10::nullopt, scale, grad_input, align_corners, "bilinear"); + } else { + grad_input.copy_( + at::upsample_linear1d_backward(grad_output.to("cpu"), output_size, input_size, align_corners, scale)); + } +} + TORCH_IMPL_FUNC(upsample_bilinear2d_out_mps) (const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { if (check_mps_compatibility("bilinear", scales_w)) { mps::upsample_out_template(input, output_size, c10::nullopt, scales_h, scales_w, output, align_corners, "bilinear"); @@ -380,8 +408,8 @@ 
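// Hypothetical usage sketch, not taken from this patch: the new
// TORCH_IMPL_FUNC(upsample_linear1d_out_mps) / upsample_linear1d_backward_out_mps
// entry points introduced above reuse the shared "bilinear" MPS resize template,
// so a 1-D linear resize can stay on the MPS device, e.g.
//
//   auto x = at::rand({1, 3, 8}, at::TensorOptions().device(at::kMPS));
//   auto y = at::upsample_linear1d(x, {16}, /*align_corners=*/false, c10::nullopt);
//
// When check_mps_compatibility("bilinear", scale) rejects the request (older
// macOS, or scale factors it cannot pass through), the else-branches above fall
// back to the CPU kernel and copy the result back, exactly as written in the hunk.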
static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { if (check_mps_compatibility("bilinear", scales_w)) { mps::upsample_out_template( diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm index 0d276c4a0c076..b583a19ef5e61 100644 --- a/aten/src/ATen/native/mps/operations/View.mm +++ b/aten/src/ATen/native/mps/operations/View.mm @@ -4,6 +4,8 @@ #include #include #include +// For MTLLanguageVersion_3_1 +#include #include #include @@ -91,7 +93,7 @@ MPSGraphTensorData* outputTensorData = [[[MPSGraphTensorData alloc] initWithMTLBuffer:outputBuffer shape:outputShape dataType:outputType] autorelease]; - NSDictionary* results = @{cachedGraph->outputTensor : outputTensorData}; + auto results = @{cachedGraph->outputTensor : outputTensorData}; runMPSGraph(stream, cachedGraph->graph(), feeds, results); } return output; @@ -732,6 +734,7 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self) { static std::unordered_map scalarToMetalType = { {c10::ScalarType::Float, "float"}, {c10::ScalarType::Half, "half"}, + {c10::ScalarType::BFloat16, "bfloat"}, {c10::ScalarType::Long, "long"}, {c10::ScalarType::Int, "int"}, {c10::ScalarType::Short, "short"}, @@ -747,66 +750,36 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self) { return it->second; } -static std::string genScatterGatherCvtFunc(const std::string& dtypeSrc, const std::string& dtypeDst) { +static std::string genScatterGatherCvtFunc(const std::string& dtypeSrc, const std::string& dtypeDst, bool needsConj) { const bool srcComplex = dtypeSrc[dtypeSrc.size() - 1] == '2'; const bool dstComplex = dtypeDst[dtypeDst.size() - 1] == '2'; if (dstComplex) { - return dtypeDst + (srcComplex ? "(x.x, x.y)" : "(x, 0.0)"); + return dtypeDst + (srcComplex ? needsConj ? "(x.x, -x.y)" : "(x.x, x.y)" : "(x, 0.0)"); } if (srcComplex) { + // TODO: Document why explicit cast is needed only for bfloat types + if (dtypeDst == "bfloat") { + return "bfloat(x.x)"; + } return "x.x"; } - return "x"; + // TODO: Document why explicit cast is needed only for bfloat types + if (dtypeDst == "bfloat") { + return "bfloat(x)"; + } + return "(x)"; } -static id compileGatherScatterOpsLibrary(id device, - const std::string& dtypeSrc, - const std::string& dtypeDst, - bool needsScatter) { - auto key = std::to_string(needsScatter) + dtypeSrc + dtypeDst; - static std::unordered_map> _libCache; - auto it = _libCache.find(key); - if (it != _libCache.end()) { - return it->second; - } - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - const auto shaderStr = fmt::format(needsScatter ? 
SCATTER_OPS_TEMPLATE : GATHER_OPS_TEMPLATE, - dtypeSrc, - dtypeDst, - genScatterGatherCvtFunc(dtypeSrc, dtypeDst)); - auto gatherScatterLib = [device newLibraryWithSource:[NSString stringWithUTF8String:shaderStr.c_str()] - options:options - error:&error]; - TORCH_CHECK(gatherScatterLib != nil && error == nil, - "Failed to compile gather-scatter library, error: ", - [[error description] UTF8String]); - _libCache[key] = gatherScatterLib; - return gatherScatterLib; -} +static MetalShaderLibrary scatterLib(SCATTER_OPS_TEMPLATE, 3); +static MetalShaderLibrary gatherLib(GATHER_OPS_TEMPLATE, 3); -static id getPipelineState(id device, - const std::string& kernel, +static id getPipelineState(const std::string& kernel, const std::string& dtypeSrc, const std::string& dtypeDst, - bool needsScatter) { - auto key = kernel + dtypeSrc + dtypeDst; - static std::unordered_map> _mtlPipelineCache; - auto it = _mtlPipelineCache.find(key); - if (it != _mtlPipelineCache.end()) { - return it->second; - } - - NSError* error = nil; - id library = compileGatherScatterOpsLibrary(device, dtypeSrc, dtypeDst, needsScatter); - id func = [library newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]]; - TORCH_CHECK(func, "Failed to load the Metal Shader function: ", kernel); - id pso = [device newComputePipelineStateWithFunction:func error:&error]; - TORCH_CHECK( - pso != nil && error == nil, "Failed to construct pipeline state: ", [[error localizedDescription] UTF8String]); - _mtlPipelineCache[key] = pso; - return pso; + bool needsScatter, + bool needsConj) { + auto cvtFunc = genScatterGatherCvtFunc(dtypeSrc, dtypeDst, needsConj); + return (needsScatter ? scatterLib : gatherLib).getPipelineStateForFunc(kernel, {dtypeSrc, dtypeDst, cvtFunc}); } Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst) { @@ -831,11 +804,11 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst) { dispatch_sync_with_rethrow(mpsStream->queue(), ^() { id computeEncoder = mpsStream->commandEncoder(); std::string functionName = getGatherScatterFunctionName(output.scalar_type(), output.dim(), /*needsScatter=*/false); - id gatherPSO = getPipelineState(MPSDevice::getInstance()->device(), - functionName, + id gatherPSO = getPipelineState(functionName, getGatherScatterScalarType(src), getGatherScatterScalarType(output), - /*needsScatter=*/false); + /*needsScatter=*/false, + src.is_conj() != dst.is_conj()); // this function call is a no-op if MPS Profiler is not enabled getMPSProfiler().beginProfileKernel(gatherPSO, functionName, {src, output}); @@ -888,11 +861,11 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst) { id computeEncoder = mpsStream->commandEncoder(); std::string functionName = getGatherScatterFunctionName(output.scalar_type(), output.dim(), /*needsScatter=*/true); - id scatterPSO = getPipelineState(MPSDevice::getInstance()->device(), - functionName, + id scatterPSO = getPipelineState(functionName, getGatherScatterScalarType(src), getGatherScatterScalarType(output), - /*needsScatter=*/true); + /*needsScatter=*/true, + src.is_conj() != output.is_conj()); getMPSProfiler().beginProfileKernel(scatterPSO, functionName, {src, output}); diff --git a/aten/src/ATen/native/mps/operations/WeightNorm.mm b/aten/src/ATen/native/mps/operations/WeightNorm.mm index 7ca63533ed19b..6cc20cfa01aa4 100644 --- a/aten/src/ATen/native/mps/operations/WeightNorm.mm +++ b/aten/src/ATen/native/mps/operations/WeightNorm.mm @@ -76,16 +76,8 @@ Placeholder norms_placeholder = Placeholder(cachedGraph->norms_, norms); 
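// Illustrative sketch, not taken from this patch: judging from the hand-written
// NSDictionary literals it replaces in the hunks above and below,
// dictionaryFromPlaceholders(...) builds the same feeds/results mapping in one
// call, roughly:
//
//   static NSDictionary* dictionaryFromPlaceholders(const Placeholder& a,
//                                                   const Placeholder& b) {
//     return @{
//       a.getMPSGraphTensor() : a.getMPSGraphTensorData(),
//       b.getMPSGraphTensor() : b.getMPSGraphTensorData(),
//     };
//   }
//
// with forms for one to four placeholders, as the call sites in these hunks use;
// runMPSGraph() also appears to gain an overload that accepts a single results
// Placeholder directly, which is why several call sites above no longer build a
// results dictionary at all.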
Placeholder w_placeholder = Placeholder(cachedGraph->w_, w); - NSDictionary* feeds = @{ - v_placeholder.getMPSGraphTensor() : v_placeholder.getMPSGraphTensorData(), - g_placeholder.getMPSGraphTensor() : g_placeholder.getMPSGraphTensorData() - }; - - NSDictionary* results = @{ - norms_placeholder.getMPSGraphTensor() : norms_placeholder.getMPSGraphTensorData(), - w_placeholder.getMPSGraphTensor() : w_placeholder.getMPSGraphTensorData() - }; - + auto feeds = dictionaryFromPlaceholders(v_placeholder, g_placeholder); + auto results = dictionaryFromPlaceholders(norms_placeholder, w_placeholder); runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results); } @@ -171,18 +163,8 @@ Placeholder grad_g_placeholder = Placeholder(cachedGraph->grad_g, grad_g); Placeholder grad_v_placeholder = Placeholder(cachedGraph->grad_v, grad_v); - NSDictionary* feeds = @{ - grad_w_placeholder.getMPSGraphTensor() : grad_w_placeholder.getMPSGraphTensorData(), - norms_placeholder.getMPSGraphTensor() : norms_placeholder.getMPSGraphTensorData(), - v_placeholder.getMPSGraphTensor() : v_placeholder.getMPSGraphTensorData(), - g_placeholder.getMPSGraphTensor() : g_placeholder.getMPSGraphTensorData() - }; - - NSDictionary* results = @{ - grad_g_placeholder.getMPSGraphTensor() : grad_g_placeholder.getMPSGraphTensorData(), - grad_v_placeholder.getMPSGraphTensor() : grad_v_placeholder.getMPSGraphTensorData() - }; - + auto feeds = dictionaryFromPlaceholders(grad_w_placeholder, norms_placeholder, v_placeholder, g_placeholder); + auto results = dictionaryFromPlaceholders(grad_g_placeholder, grad_v_placeholder); runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results); } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 4960417abdbef..b75bc85bbed53 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -134,7 +134,7 @@ autogen: _new_zeros_with_same_feature_meta.out # This function compares the storage numel of self with that of other, where -# storage numel is cumputed as: `other.storage().nbytes() / other.itemsize()`. +# storage numel is computed as: `other.storage().nbytes() / other.itemsize()`. # We create this function for composite compliance purposes. The batching rule # always returns true because vmapped as_strided does not support accessing # storage locations not indexable by the input tensor. @@ -189,6 +189,10 @@ - func: _assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> () +- func: _print(str s) -> () + dispatch: + CompositeExplicitAutograd: _print + - func: sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> () dispatch: CompositeExplicitAutograd: sym_constrain_range @@ -478,6 +482,7 @@ - func: conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: CPU, CUDA: conj_physical_out + MPS: conj_physical_out_mps SparseCPU, SparseCUDA: conj_physical_out_sparse SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_out tags: pointwise @@ -544,8 +549,8 @@ structured_delegate: add.out variants: function, method dispatch: - SparseCPU, SparseCUDA: add_sparse - SparseCsrCPU, SparseCsrCUDA: add_sparse_csr + SparseCPU, SparseCUDA, SparseMeta: add_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr MkldnnCPU: mkldnn_add ZeroTensor: add_zerotensor NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor @@ -556,8 +561,8 @@ variants: method structured_delegate: add.out dispatch: - SparseCPU, SparseCUDA: add_sparse_ - SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_ + SparseCPU, SparseCUDA, SparseMeta: add_sparse_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_ MkldnnCPU: mkldnn_add_ NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor tags: pointwise @@ -570,9 +575,9 @@ Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf) ScalarOnly: add (Bool) dispatch: - SparseCPU: add_out_sparse_cpu + SparseCPU, SparseMeta: add_out_sparse_cpu SparseCUDA: add_out_sparse_cuda - SparseCsrCPU: add_out_sparse_compressed_cpu + SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu SparseCsrCUDA: add_out_sparse_compressed_cuda MkldnnCPU: mkldnn_add_out MPS: add_out_mps @@ -771,7 +776,7 @@ dispatch: CompositeExplicitAutograd: arange -# This operator should be named `aragne.start_out` if following the naming convention. However that +# This operator should be named `arange.start_out` if following the naming convention. However that # name is already taken. Disabled because of CI job failures. # FIXME: enable this #- func: arange.start_out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!) @@ -1228,6 +1233,13 @@ CompositeExplicitAutograd: copysign_out tags: pointwise +- func: _lazy_clone(Tensor self) -> Tensor + # Like clone, but the copy takes place lazily, only if either the + # input or the output are written. + variants: function, method + dispatch: + CompositeExplicitAutograd: _lazy_clone + - func: logical_not(Tensor self) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -2358,7 +2370,7 @@ Meta: empty_meta_symint MkldnnCPU: empty_mkldnn SparseCPU, SparseCUDA, SparseMeta: empty_sparse - SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_sparse_compressed QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized tags: core @@ -2464,7 +2476,7 @@ CompositeExplicitAutograd: empty_like QuantizedCPU, QuantizedCUDA: empty_like_quantized SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo - SparseCsrCPU, SparseCsrCUDA: empty_like_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr NestedTensorCPU, NestedTensorCUDA: empty_like_nested autogen: empty_like.out @@ -2966,12 +2978,14 @@ dispatch: CPU: _fft_r2c_mkl CUDA: _fft_r2c_cufft + MPS: _fft_r2c_mps - func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: CPU: _fft_r2c_mkl_out CUDA: _fft_r2c_cufft_out + MPS: _fft_r2c_mps_out # Complex to real inverse FFT - func: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor @@ -2979,12 +2993,14 @@ dispatch: CPU: _fft_c2r_mkl CUDA: _fft_c2r_cufft + MPS: _fft_c2r_mps - func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!) 
variants: function dispatch: CPU: _fft_c2r_mkl_out CUDA: _fft_c2r_cufft_out + MPS: _fft_c2r_mps_out # Standard complex to complex FFT (forward or backward) - func: _fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor @@ -2992,12 +3008,14 @@ dispatch: CPU: _fft_c2c_mkl CUDA: _fft_c2c_cufft + MPS: _fft_c2c_mps - func: _fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: CPU: _fft_c2c_mkl_out CUDA: _fft_c2c_cufft_out + MPS: _fft_c2c_mps_out - func: _validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> () device_check: NoCheck @@ -3109,6 +3127,7 @@ structured: True dispatch: CPU, CUDA: isin_Tensor_Tensor_out + MPS: isin_Tensor_Tensor_out_mps - func: isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor variants: function @@ -3250,6 +3269,8 @@ autogen: native_layer_norm_backward.out tags: core +- func: rms_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor + - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor variants: function, method dispatch: @@ -3322,10 +3343,31 @@ dispatch: CUDA: _cslt_sparse_mm_search +- func: _sparse_semi_structured_tile(Tensor input, str algorithm="", bool use_cutlass=True) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + CUDA: _sparse_semi_structured_tile + +- func: _sparse_semi_structured_apply(Tensor input, Tensor thread_masks) -> (Tensor, Tensor) + dispatch: + CUDA: _sparse_semi_structured_apply + +- func: _sparse_semi_structured_apply_dense(Tensor input, Tensor thread_masks) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_apply_dense + +# DEPRECATED: Use torch.__sparse_semi_structured_mm/torch._sparse_semi_structured_addmm instead - func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor dispatch: CUDA: _sparse_semi_structured_linear +- func: _sparse_semi_structured_mm(Tensor mat1, Tensor mat1_meta, Tensor mat2, *, ScalarType? out_dtype=None) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_mm + +- func: _sparse_semi_structured_addmm(Tensor input, Tensor mat1, Tensor mat1_meta, Tensor mat2, *, Scalar alpha=1, Scalar beta=1, ScalarType? out_dtype=None) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_addmm + - func: _mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor dispatch: CUDA: _mixed_dtypes_linear @@ -4066,20 +4108,30 @@ - func: _int_mm(Tensor self, Tensor mat2) -> Tensor dispatch: + CPU: _int_mm_cpu CUDA: _int_mm_cuda - func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: + CPU: _int_mm_out_cpu CUDA: _int_mm_out_cuda - func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor dispatch: + CPU: _convert_weight_to_int4pack_cpu CUDA: _convert_weight_to_int4pack_cuda - func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor dispatch: + CPU: _weight_int4pack_mm_cpu + MPS: _weight_int4pack_mm_mps CUDA: _weight_int4pack_mm_cuda +- func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor + dispatch: + CPU: _weight_int8pack_mm_cpu + MPS: _weight_int8pack_mm_mps + - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor python_module: sparse @@ -4455,7 +4507,6 @@ MPS: pixel_shuffle_mps CompositeExplicitAutogradNonFunctional: math_pixel_shuffle autogen: pixel_shuffle.out - tags: core - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor dispatch: @@ -4826,7 +4877,7 @@ device_guard: False dispatch: CompositeImplicitAutograd: reshape_symint - CompositeImplicitAutogradNestedTensor: reshape_nested + CompositeImplicitAutogradNestedTensor: reshape_nested_symint - func: _reshape_copy(Tensor self, SymInt[] size) -> Tensor variants: function @@ -4985,6 +5036,7 @@ device_check: NoCheck # TensorIterator python_module: nn dispatch: + QuantizedCPU: gelu_quantized_cpu_ NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_ - func: gelu(Tensor self, *, str approximate='none') -> Tensor @@ -5372,6 +5424,21 @@ CompositeExplicitAutograd: slice_backward autogen: slice_backward.out +# NB: This op exists to back the implementation of reverse view_funcs for various views (chunk, +# slice.Tensor, split_with_sizes, et. al.). Currently, these are only used during fake-ification +# of PT2 graph input subclass instances that are views. This means: +# * This op shouldn't really show up in eager mode (so e.g. XLA shouldn't have to implement it) +# * This op shouldn't show up in a PT2 graph (so a PT2 backend shouldn't have to implement it) +# * A subclass will have to implement this to work in PT2 if a subclass view is used as a graph +# input AND the view utilizes this op in its inverse. The idea is that slice_inverse() is +# easier to implement for a subclass than as_strided() +- func: slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: slice_inverse_symint + - func: slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor variants: function, method device_check: NoCheck @@ -5379,7 +5446,7 @@ dispatch: CompositeExplicitAutogradNonFunctional: slice_scatter autogen: slice_scatter.out - tags: core + tags: [core, view_copy] - func: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor variants: function, method @@ -5578,6 +5645,16 @@ SparseCPU: _sspaddmm_out_cpu SparseCUDA: _sspaddmm_out_cuda +- func: _chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor + dispatch: + CompositeExplicitAutograd: _chunk_cat + CUDA: _chunk_cat_cuda + +- func: _chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CompositeExplicitAutograd: _chunk_cat_out + CUDA: _chunk_cat_out_cuda + - func: stack(Tensor[] tensors, int dim=0) -> Tensor dispatch: CompositeExplicitAutograd: stack @@ -5642,8 +5719,8 @@ variants: function, method dispatch: CompositeExplicitAutograd: sum - SparseCPU, SparseCUDA: sum_coo - SparseCsrCPU, SparseCsrCUDA: sum_csr + SparseCPU, SparseCUDA, SparseMeta: sum_coo + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr autogen: sum.out - func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor @@ -5769,6 +5846,7 @@ variants: function dispatch: CPU, CUDA: std_mean + MPS: std_mean_mps autogen: std_mean.correction_out - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) @@ -6024,7 +6102,6 @@ CPU, MPS: roll CUDA: roll_cuda autogen: roll.out - tags: core # default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args @@ -6107,6 +6184,58 @@ CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy autogen: _nested_view_from_buffer_copy.out +- func: _nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor(a) + variants: function + device_check: NoCheck + dispatch: {} + +- func: _nested_view_from_jagged_copy(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor + variants: function + device_check: NoCheck + tags: view_copy + dispatch: + CompositeExplicitAutogradNonFunctional: _nested_view_from_jagged_copy + autogen: _nested_view_from_jagged_copy.out + +- func: _nested_get_values(Tensor(a) self) -> Tensor(a) + variants: function + device_check: NoCheck + dispatch: {} + +- func: _nested_get_values_copy(Tensor self) -> Tensor + variants: function + device_check: NoCheck + tags: view_copy + dispatch: + CompositeExplicitAutogradNonFunctional: _nested_get_values_copy + autogen: _nested_get_values_copy.out + +- func: _nested_get_offsets(Tensor self) -> Tensor + variants: function + device_check: NoCheck + dispatch: {} + +# returns undefined Tensor if no lengths present +- func: _nested_get_lengths(Tensor self) -> Tensor + variants: function + device_check: NoCheck + dispatch: {} + +- func: _nested_get_ragged_idx(Tensor self) -> int + variants: function + device_check: NoCheck + dispatch: {} + +- func: _nested_get_jagged_dummy(Tensor any) -> Tensor + category_override: dummy + dispatch: {} + +- func: _nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor) + variants: function + device_check: NoCheck + dispatch: + CPU, CUDA: _nested_compute_contiguous_strides_offsets + - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor dispatch: # calls unsqueeze @@ -6291,6 +6420,7 @@ variants: function dispatch: CPU, CUDA: var_mean + MPS: var_mean_mps autogen: var_mean.correction_out - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) @@ -6311,15 +6441,13 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA: where - MPS: where_mps + CPU, CUDA, MPS: where tags: [core, pointwise] - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: where_self_out - MPS: where_self_out_mps + CPU, CUDA, MPS: where_self_out - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor variants: function @@ -6373,7 +6501,7 @@ CPU: _efficientzerotensor CUDA: _efficientzerotensor_cuda MPS: _efficientzerotensor_mps - Meta: _efficientzerotensor_meta + Meta: _efficientzerotensor_meta_symint autogen: _efficientzerotensor.out - func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -6450,6 +6578,32 @@ SparseCPU, SparseCUDA: norm_sparse autogen: native_norm.ScalarOpt_dim_dtype_out +- func: _batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CPU: _batch_norm_with_update_cpu + CUDA: _batch_norm_with_update_cuda + MPS: _batch_norm_with_update_mps + MkldnnCPU: _batch_norm_with_update_mkldnn + autogen: _batch_norm_with_update_functional + +- func: _batch_norm_with_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd, Tensor(g!) reserve) -> (Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!)) + dispatch: + CPU: _batch_norm_with_update_cpu_out + CUDA: _batch_norm_with_update_cuda_out + MPS: _batch_norm_with_update_mps_out + +- func: _batch_norm_no_update(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CompositeExplicitAutograd: _batch_norm_no_update + autogen: _batch_norm_no_update.out + +- func: batch_norm_backward(Tensor grad_out, Tensor input, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, bool update, float eps, bool[3] output_mask, Tensor reserve) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: _new_batch_norm_backward_cpu + CUDA: _new_batch_norm_backward_cuda + MPS: _new_batch_norm_backward_mps + MkldnnCPU: _new_batch_norm_backward_mkldnn + # TODO: reduce signatures down to one when optional args is available - func: _sparse_sum(Tensor self) -> Tensor @@ -6660,7 +6814,7 @@ MPS: zero_mps_ Meta: zero_meta_ SparseCPU, SparseCUDA, SparseMeta: zero_sparse_ - SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_ MkldnnCPU: mkldnn_zero_ NestedTensorCPU, NestedTensorCUDA: zero_nested_ autogen: zero, zero.out @@ -6950,6 +7104,10 @@ # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. +- func: _sparse_compressed_tensor_with_dims(int nnz, int dense_dim, int[] size, int[] blocksize, ScalarType index_dtype, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + CompositeExplicitAutograd: sparse_compressed_tensor_with_dims + - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=False) -> Tensor dispatch: CompositeExplicitAutograd: sparse_compressed_tensor @@ -7054,9 +7212,9 @@ - func: sparse_dim(Tensor self) -> int variants: method dispatch: - CPU, CUDA: sparse_dim_strided SparseCPU, SparseCUDA, SparseMeta: sparse_dim_sparse - SparseCsrCPU, SparseCsrCUDA: sparse_dim_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_dim_sparse_csr + CompositeExplicitAutograd: sparse_dim_default device_check: NoCheck device_guard: False @@ -7071,9 +7229,9 @@ - func: dense_dim(Tensor self) -> int variants: method dispatch: - CPU, CUDA: dense_dim_strided SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse - SparseCsrCPU, SparseCsrCUDA: dense_dim_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr + CompositeExplicitAutograd: dense_dim_default device_check: NoCheck device_guard: False @@ -7089,7 +7247,7 @@ variants: method dispatch: SparseCPU, SparseCUDA, SparseMeta: _nnz_sparse - SparseCsrCPU, SparseCsrCUDA: _nnz_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _nnz_sparse_csr device_check: NoCheck device_guard: False @@ -7152,7 +7310,7 @@ variants: method dispatch: SparseCPU, SparseCUDA, SparseMeta: values_sparse - SparseCsrCPU, SparseCsrCUDA: values_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr NestedTensorCPU, NestedTensorCUDA: values_nested CompositeExplicitAutograd: values_default device_check: NoCheck @@ -7161,7 +7319,7 @@ - func: crow_indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCsrCPU, SparseCsrCUDA: crow_indices_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: crow_indices_sparse_csr CompositeExplicitAutograd: crow_indices_default device_check: NoCheck device_guard: False @@ -7169,7 +7327,7 @@ - func: col_indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCsrCPU, SparseCsrCUDA: col_indices_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: col_indices_sparse_csr CompositeExplicitAutograd: col_indices_default device_check: NoCheck device_guard: False @@ -7177,7 +7335,7 @@ - func: ccol_indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCsrCPU, SparseCsrCUDA: ccol_indices_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ccol_indices_sparse_csr CompositeExplicitAutograd: ccol_indices_default device_check: NoCheck device_guard: False @@ -7185,7 +7343,7 @@ - func: row_indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCsrCPU, SparseCsrCUDA: row_indices_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: row_indices_sparse_csr CompositeExplicitAutograd: row_indices_default device_check: NoCheck device_guard: False @@ -7204,7 +7362,7 @@ device_check: NoCheck # Allows copy into different device variants: function dispatch: - SparseCPU, SparseCUDA: copy_sparse_ + SparseCPU, SparseCUDA, SparseMeta: copy_sparse_ autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out # By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors @@ -7307,7 +7465,7 @@ MkldnnCPU: mkldnn_reorder_conv2d_weight autogen: mkldnn_reorder_conv2d_weight.out -- func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor +- func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? 
input_size=None) -> Tensor variants: function python_module: nn dispatch: @@ -7694,6 +7852,7 @@ dispatch: CPU, CUDA, Meta, MPS: set_ autogen: set.source_Storage, set.source_Storage_out + tags: inplace_view - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!) variants: method @@ -7706,6 +7865,7 @@ MPS: set_storage_mps_ QuantizedCPU, QuantizedCUDA: set_storage_quantized_ autogen: set.source_Storage_storage_offset, set.source_Storage_storage_offset_out + tags: inplace_view - func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!) variants: method @@ -7713,6 +7873,7 @@ device_guard: False dispatch: CompositeImplicitAutograd: set__symint + tags: inplace_view - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) variants: method @@ -7721,6 +7882,7 @@ dispatch: CPU, CUDA, Meta, MPS: set_tensor_ autogen: set.source_Tensor, set.source_Tensor_out + tags: inplace_view - func: set_(Tensor(a!) self) -> Tensor(a!) variants: method @@ -7730,6 +7892,7 @@ Meta: set_meta_ MPS: set_mps_ autogen: set, set.out + tags: inplace_view # Not making it CompositeImplicitAutograd because lift # should be a primitive w.r.t. functorch @@ -10125,18 +10288,21 @@ variants: method, function dispatch: CompositeExplicitAutograd: alias + NestedTensorCPU, NestedTensorCUDA: alias_nested tags: core - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> () variants: function dispatch: CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_ + CPU: _amp_foreach_non_finite_check_and_unscale_cpu_ autogen: _amp_foreach_non_finite_check_and_unscale, _amp_foreach_non_finite_check_and_unscale.out - func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!) variants: function dispatch: CUDA: _amp_update_scale_cuda_ + CPU: _amp_update_scale_cpu_ autogen: _amp_update_scale, _amp_update_scale.out #- func: _cat(Tensor[] tensors, int dim=0) -> Tensor @@ -12360,6 +12526,7 @@ dispatch: CPU: upsample_linear1d_out_cpu CUDA: upsample_linear1d_out_cuda + MPS: upsample_linear1d_out_mps - func: upsample_linear1d(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None) -> Tensor python_module: nn @@ -12371,6 +12538,7 @@ dispatch: CPU: upsample_linear1d_backward_out_cpu CUDA: upsample_linear1d_backward_out_cuda + MPS: upsample_linear1d_backward_out_mps - func: upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor python_module: nn @@ -12843,7 +13011,7 @@ SparseMeta: isinf_sparse_meta SparseCsrCPU, SparseCsrCUDA: isinf_sparse_csr autogen: isinf.out - tags: core + tags: [core, pointwise] - func: record_stream(Tensor(a!) self, Stream s) -> () variants: method @@ -13769,11 +13937,18 @@ dispatch: CPU, CUDA: linalg_eig_out +- func: _linalg_eigvals(Tensor self) -> Tensor + python_module: linalg + dispatch: + CPU, CUDA: _linalg_eigvals + - func: linalg_eigvals(Tensor self) -> Tensor python_module: linalg - func: linalg_eigvals.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
python_module: linalg + dispatch: + CPU, CUDA: linalg_eigvals_out # This function is exposes the `compute_v` flag, which is then used to implement `linalg.eigh` and # `linalg.eigvalsh` as composite functions that call this one @@ -14077,6 +14252,12 @@ # It is undocumented and should not be used outside of tests. - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor +# Note: for testing COW materialization within `at::parallel_for` loop function +- func: _test_parallel_materialize(Tensor self, int num_parallel, bool skip_first=False) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _test_parallel_materialize + # Note: this function is only for testing. - func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor python_module: nn @@ -14411,6 +14592,7 @@ variants: function dispatch: CompositeExplicitAutograd: split_with_sizes_copy_out + CUDA: split_with_sizes_copy_out_cuda - func: view_copy(Tensor self, SymInt[] size) -> Tensor variants: function @@ -14521,6 +14703,16 @@ CUDA: _scaled_dot_product_efficient_attention_backward_cuda tags: nondeterministic_seeded +- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) + dispatch: + CUDA: _scaled_dot_product_cudnn_attention_cuda + tags: nondeterministic_seeded + +- func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: _scaled_dot_product_cudnn_attention_backward_cuda + tags: nondeterministic_seeded + - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) variants: function dispatch: @@ -14533,14 +14725,14 @@ dispatch: CUDA: _flash_attention_backward -# Returns ouput, logsumexp if compute_logsumexp -- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, int? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k) +# Returns output, logsumexp if compute_logsumexp +- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt? max_seqlen_q, SymInt? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None, int? 
window_size=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k) variants: function dispatch: CUDA: _efficient_attention_forward tags: nondeterministic_seeded -- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor) +- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None, int? window_size=None) -> (Tensor, Tensor, Tensor, Tensor) device_check: NoCheck variants: function dispatch: @@ -15345,6 +15537,7 @@ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). variants: function dispatch: + CPU: _fused_adam_kernel_cpu_ CUDA: _fused_adam_kernel_cuda_ autogen: _fused_adam, _fused_adam.out @@ -15354,6 +15547,7 @@ device_check: NoCheck variants: function dispatch: + CPU: _fused_adam_kernel_cpu_ CUDA: _fused_adam_kernel_cuda_ autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out @@ -15361,6 +15555,7 @@ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). variants: function dispatch: + CPU: _fused_adamw_kernel_cpu_ CUDA: _fused_adamw_kernel_cuda_ autogen: _fused_adamw, _fused_adamw.out @@ -15370,9 +15565,28 @@ device_check: NoCheck variants: function dispatch: + CPU: _fused_adamw_kernel_cpu_ CUDA: _fused_adamw_kernel_cuda_ autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out +- func: _fused_sgd_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). + variants: function + dispatch: + CPU: _fused_sgd_kernel_cpu_ + CUDA: _fused_sgd_kernel_cuda_ + autogen: _fused_sgd, _fused_sgd.out + +- func: _fused_sgd_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). + # but still skip the device check as the Tensor LR can be on CPU + device_check: NoCheck + variants: function + dispatch: + CPU: _fused_sgd_kernel_cpu_ + CUDA: _fused_sgd_kernel_cuda_ + autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out + # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts. 
- func: _propagate_xla_data(Tensor input, Tensor output) -> () variants: function diff --git a/aten/src/ATen/native/nested/NestedTensorBackward.cpp b/aten/src/ATen/native/nested/NestedTensorBackward.cpp index ef992a37c8688..e4465b792c21e 100644 --- a/aten/src/ATen/native/nested/NestedTensorBackward.cpp +++ b/aten/src/ATen/native/nested/NestedTensorBackward.cpp @@ -44,16 +44,16 @@ std::tuple nested_linear_backward( return std::tuple{Tensor(), Tensor(), Tensor()}; } Tensor grad_input, grad_weight, grad_bias; - auto grad_ouput_contiguous = grad_output.contiguous(); - auto* nt_grad_output = get_nested_tensor_impl(grad_ouput_contiguous); + auto grad_output_contiguous = grad_output.contiguous(); + auto* nt_grad_output = get_nested_tensor_impl(grad_output_contiguous); auto* nt_input = get_nested_tensor_impl(input); TORCH_INTERNAL_ASSERT(nt_grad_output != nullptr); TORCH_INTERNAL_ASSERT(nt_input != nullptr); TORCH_INTERNAL_ASSERT(nested_tensor_impl_is_contiguous(nt_grad_output)); - auto grad_ouput_buffer = nt_grad_output->get_buffer(); + auto grad_output_buffer = nt_grad_output->get_buffer(); auto input_buffer = nt_input->get_buffer(); - auto reshaped_grad = grad_ouput_buffer.reshape({-1, weight.size(0)}); + auto reshaped_grad = grad_output_buffer.reshape({-1, weight.size(0)}); if (output_mask[0]) { auto grad_input_buffer = at::mm(reshaped_grad, weight).view({-1}); @@ -137,7 +137,7 @@ Tensor _nested_sum_backward_cpu( AT_DISPATCH_ALL_TYPES_AND2( ScalarType::Half, ScalarType::BFloat16, self_grad_buffer.scalar_type(), "nested_sum_dim_cpu", [&]() { auto* self_grad_data = self_grad_buffer.data_ptr(); - const auto* output_grad_data = grad_buffer.data_ptr(); + const auto* output_grad_data = grad_buffer.const_data_ptr(); int64_t out_idx = 0, in_idx = 0; for (const auto i : c10::irange(ntensors)) { int64_t segments = num_segments[i].item(); diff --git a/aten/src/ATen/native/nested/NestedTensorFactories.cpp b/aten/src/ATen/native/nested/NestedTensorFactories.cpp index eaf3a1c88834c..45425ed63315c 100644 --- a/aten/src/ATen/native/nested/NestedTensorFactories.cpp +++ b/aten/src/ATen/native/nested/NestedTensorFactories.cpp @@ -230,5 +230,20 @@ Tensor narrow_nested_symint(const at::Tensor& self, int64_t dim, SymInt start, S storage_offsets); } +Tensor alias_nested(const Tensor& self) { + auto* nt_impl = get_nested_tensor_impl(self); + const at::Tensor& buffer = nt_impl->get_unsafe_storage_as_tensor(); + const auto& nested_sizes = nt_impl->get_nested_sizes(); + const auto& nested_strides = nt_impl->get_nested_strides(); + const auto& storage_offsets = nt_impl->get_storage_offsets(); + return at::detail::make_tensor( + c10::TensorImpl::VIEW, + std::move(buffer), + std::move(nested_sizes), + std::move(nested_strides), + std::move(storage_offsets)); +} + + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/nested/NestedTensorMath.cpp b/aten/src/ATen/native/nested/NestedTensorMath.cpp index c4bc824fdb3cf..7d3e826ef53e9 100644 --- a/aten/src/ATen/native/nested/NestedTensorMath.cpp +++ b/aten/src/ATen/native/nested/NestedTensorMath.cpp @@ -403,7 +403,7 @@ Tensor NestedTensor_sum_dim_CPU( AT_DISPATCH_ALL_TYPES_AND2( ScalarType::Half, ScalarType::BFloat16, buffer.scalar_type(), "nested_sum_dim_cpu", [&]() { auto* output_data = output_buffer.data_ptr(); - const auto* input_data = buffer.data_ptr(); + const auto* input_data = buffer.const_data_ptr(); int64_t out_idx = 0, in_idx = 0; for (const auto i : c10::irange(ntensors)) { int64_t segments = num_segments[i].item(); @@ -680,7 +680,7 @@ inline 
std::tuple NestedTensor_compute_size_stride( std::vector size_reshaped_vector(proposed_shape.begin() + 1, proposed_shape.end()); // only allow one pre-existing dimension to have proposed shape == -1 int64_t infer_index_old = -1; - // some negative sizes remain to be infered + // some negative sizes remain to be inferred if (ndims_underlying < ndims_underlying_reshaped) { int64_t numel = 1, numel_reshaped = 1; // replace negative sizes for old dimensions with old sizes @@ -770,7 +770,7 @@ inline std::tuple NestedTensor_compute_size_stride( } // namespace // Note [Special size rule for nested tensor] -// Instead of infering size, -1 means "inherit the old size", so: +// Instead of inferring size, -1 means "inherit the old size", so: // * negative size is legal for a ragged dimension // * however, we only allow one -1 // In principle we could still infer a dimension, @@ -861,6 +861,12 @@ Tensor _nested_view_from_buffer( storage_offsets); } +std::tuple _nested_compute_contiguous_strides_offsets(const Tensor& nested_size) { + return std::make_tuple( + construct_nested_strides(nested_size), + construct_offsets(nested_size)); +} + // See Note [Special size rule for nested tensor] Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape) { TORCH_CHECK( @@ -894,7 +900,26 @@ Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape) { } } +Tensor reshape_nested_symint(const Tensor& self, SymIntArrayRef proposed_shape) { + // Jagged layout NT decomp + if (self.layout() == at::kJagged) { + // TODO: Expand decomp to handle other viewable cases + bool viewable = self.is_contiguous(); + return ( + viewable ? self.view_symint(proposed_shape) : + self.clone(at::MemoryFormat::Contiguous).view_symint(proposed_shape) + ); + } + + return reshape_nested(self, C10_AS_INTARRAYREF_SLOW(proposed_shape)); +} + Tensor reshape_as_nested(const Tensor& self, const Tensor& other) { + // Jagged layout NT decomp + if (self.layout() == at::kJagged) { + return self.reshape_symint(other.sym_sizes()); + } + auto other_ptr = get_nested_tensor_impl(other); // TODO: this is to reproduce other_ptr->opt_sizes_ // if an accessor is provided in the future, can replace this diff --git a/aten/src/ATen/native/nested/NestedTensorMath.h b/aten/src/ATen/native/nested/NestedTensorMath.h index 8269985990966..068cc6b51ee70 100644 --- a/aten/src/ATen/native/nested/NestedTensorMath.h +++ b/aten/src/ATen/native/nested/NestedTensorMath.h @@ -75,5 +75,7 @@ C10_ALWAYS_INLINE std::pair _check_nested_layer_norm_inputs( return std::make_pair(M, N); } +Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape); + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/nested/NestedTensorMatmul.cpp b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp index 8b937f16b0cf3..88e2a94570185 100644 --- a/aten/src/ATen/native/nested/NestedTensorMatmul.cpp +++ b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -20,38 +21,30 @@ namespace at { namespace native { Tensor bmm_nested(const Tensor& self, const Tensor& mat2) { - if (self.is_nested() && !mat2.is_nested()) { - AT_ERROR("Expected both to be nested, but got a nested self and non-nested other"); - } - else if (!self.is_nested() && mat2.is_nested()) { - AT_ERROR("Expected both to be nested, but got a non-nested self and nested other"); - } - // dispatcher should have guaranteed that at least one is nested - auto self_ptr = get_nested_tensor_impl(self); - auto mat2_ptr = get_nested_tensor_impl(mat2); - 
TORCH_CHECK(self_ptr->dim() == 3, "batch1 must be a 3D tensor"); - TORCH_CHECK(mat2_ptr->dim() == 3, "batch2 must be a 3D tensor"); - int64_t ntensors = self_ptr->size(0), - ntensors2 = mat2_ptr->size(0); + TORCH_CHECK(self.dim() == 3, "batch1 must be a 3D tensor"); + TORCH_CHECK(mat2.dim() == 3, "batch2 must be a 3D tensor"); + + int64_t ntensors = self.is_nested() ? get_nested_tensor_impl(self)->size(0) : self.size(0); + int64_t ntensors2 = mat2.is_nested() ? get_nested_tensor_impl(mat2)->size(0) : mat2.size(0); + TORCH_CHECK(ntensors == ntensors2, "Expected size for the 1st dimension of batch2 tensor to be: ", ntensors, " but got: ", ntensors2, "."); - const Tensor& self_buffer = self_ptr->get_unsafe_storage_as_tensor(), - & mat2_buffer = mat2_ptr->get_unsafe_storage_as_tensor(); - std::vector self_sizes = NestedTensor_get_sizes(self_ptr), - mat2_sizes = NestedTensor_get_sizes(mat2_ptr), - self_strides = NestedTensor_get_strides(self_ptr), - mat2_strides = NestedTensor_get_strides(mat2_ptr); - int64_t *self_offsets_ptr = self_ptr->get_storage_offsets().data_ptr(); - int64_t *mat2_offsets_ptr = mat2_ptr->get_storage_offsets().data_ptr(); + + const Tensor& self_buffer = self.is_nested() ? get_nested_tensor_impl(self)->get_unsafe_storage_as_tensor() : self; + const Tensor& mat2_buffer = mat2.is_nested() ? get_nested_tensor_impl(mat2)->get_unsafe_storage_as_tensor() : mat2; + + // create a contiguous output int64_t out_numel = 0; - const Tensor& self_sizemat = self_ptr->get_nested_sizes(); + const Tensor& self_sizemat = self.is_nested() ? + get_nested_tensor_impl(self)->get_nested_sizes() : get_nested_tensor_impl(mat2)->get_nested_sizes(); + Tensor out_sizemat = self_sizemat.new_empty(self_sizemat.sizes()); int64_t* out_sizemat_ptr = out_sizemat.data_ptr(); for (int64_t i = 0; i < ntensors; i++) { - const IntArrayRef& self_shape = self_sizes[i], - & mat2_shape = mat2_sizes[i]; + const IntArrayRef& self_shape = get_size_for_index(self, i); + const IntArrayRef& mat2_shape = get_size_for_index(mat2, i); const int64_t& self_size0 = self_shape[0], & self_size1 = self_shape[1], & mat2_size0 = mat2_shape[0], & mat2_size1 = mat2_shape[1]; TORCH_CHECK(self_size1 == mat2_size0, @@ -63,7 +56,7 @@ Tensor bmm_nested(const Tensor& self, const Tensor& mat2) { out_sizemat_ptr += 2; out_numel += self_size0 * mat2_size1; } - Tensor out_buffer = self_buffer.new_empty(out_numel); + Tensor out_buffer = self.is_nested() ? 
self_buffer.new_empty(out_numel) : mat2_buffer.new_empty(out_numel); Tensor output = wrap_buffer(out_buffer, out_sizemat); // call tensor mm // TODO: `padding nested tensor -> bmm -> remove padding` may be more efficient @@ -73,12 +66,14 @@ Tensor bmm_nested(const Tensor& self, const Tensor& mat2) { std::vector output_unbind = output.unbind(); for (int64_t i = 0; i < ntensors; i++) { at::mm_out(output_unbind[i], - self_buffer.as_strided(self_sizes[i], self_strides[i], self_offsets_ptr[i]), - mat2_buffer.as_strided(mat2_sizes[i], mat2_strides[i], mat2_offsets_ptr[i])); + self_buffer.as_strided(get_size_for_index(self, i), get_stride_for_index(self, i), get_offset_for_index(self, i)), + mat2_buffer.as_strided(get_size_for_index(mat2, i), get_stride_for_index(mat2, i), get_offset_for_index(mat2, i))); } return output; } + + static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { // Tensor self = self_.contiguous(); // Tensor mat2 = mat2_.contiguous(); @@ -89,13 +84,15 @@ static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { // metadata for self std::vector self_sizes = NestedTensor_get_sizes(self_ptr); std::vector self_strides = NestedTensor_get_strides(self_ptr); - int64_t *self_offsets_ptr = self_ptr->get_storage_offsets().data_ptr(); + int64_t* self_offsets_ptr = + self_ptr->get_storage_offsets().data_ptr(); auto opt = self_ptr->get_nested_sizes().options(); // metadata for mat2 std::vector mat2_sizes = NestedTensor_get_sizes(mat2_ptr); std::vector mat2_strides = NestedTensor_get_strides(mat2_ptr); - int64_t *mat2_offsets_ptr = mat2_ptr->get_storage_offsets().data_ptr(); + int64_t* mat2_offsets_ptr = + mat2_ptr->get_storage_offsets().data_ptr(); auto opt2 = mat2_ptr->get_nested_sizes().options(); int64_t N = self_sizes.size(); @@ -108,7 +105,7 @@ static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { auto self_new_strides = at::empty({N * n_heads, 2}, opt); int64_t* self_new_strides_ptr = self_new_strides.mutable_data_ptr(); auto self_new_offsets = at::empty({N * n_heads}, opt); - int64_t *self_new_offsets_ptr = self_new_offsets.mutable_data_ptr(); + int64_t* self_new_offsets_ptr = self_new_offsets.mutable_data_ptr(); // viewed metadata for mat2 auto mat2_new_sizes = at::empty({N * n_heads, 2}, opt2); @@ -117,7 +114,7 @@ static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { auto mat2_new_strides = at::empty({N * n_heads, 2}, opt2); int64_t* mat2_new_strides_ptr = mat2_new_strides.mutable_data_ptr(); auto mat2_new_offsets = at::empty({N * n_heads}, opt); - int64_t *mat2_new_offsets_ptr = mat2_new_offsets.mutable_data_ptr(); + int64_t* mat2_new_offsets_ptr = mat2_new_offsets.mutable_data_ptr(); for (int64_t i = 0; i < N; i++) { const IntArrayRef& self_size_i = self_sizes[i]; @@ -146,7 +143,6 @@ static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { } } - // view self as [N * n_heads, *, head_dim] (collapse first 2 dims) auto viewed_self = create_nested_view_tensor( self, self_new_sizes, self_new_strides, self_new_offsets); @@ -163,7 +159,7 @@ static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { auto out_new_sizes = at::empty({N, 3}, opt); auto out_new_strides = at::empty({N, 3}, opt); auto out_new_offsets = at::empty({N}, opt); - int64_t *out_new_offsets_ptr = out_new_offsets.mutable_data_ptr(); + int64_t* out_new_offsets_ptr = out_new_offsets.mutable_data_ptr(); int64_t* out_new_sizes_ptr = out_new_sizes.data_ptr(); int64_t* out_new_strides_ptr = 
out_new_strides.data_ptr(); @@ -187,15 +183,16 @@ static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { bmm_output, out_new_sizes, out_new_strides, out_new_offsets); return viewed_out; - } // nt: NT of shape (B, *, C, D) // other: dense tensor of shape (D, E) // output: NT of shape (B, *, C, E) -static Tensor matmul_nested_with_broadcasted_dense(const Tensor& nt, const Tensor& other) { +static Tensor matmul_nested_with_broadcasted_dense( + const Tensor& nt, + const Tensor& other) { // View nt buffer as 3D jagged for matmul - auto *nt_impl = get_nested_tensor_impl(nt); + auto* nt_impl = get_nested_tensor_impl(nt); auto jagged = nt_impl->get_buffer().view({-1, nt.size(2), nt.size(3)}); auto new_buffer = at::matmul(jagged, other); @@ -222,22 +219,19 @@ static Tensor matmul_nested_with_broadcasted_dense(const Tensor& nt, const Tenso // TODO: Should make full matmul semantics support some day Tensor matmul_nested(const Tensor& self, const Tensor& mat2) { // special case of NT (B, *, C, D) with broadcasted dense (D, E) - if (self.is_nested() && - self.is_contiguous() && - !mat2.is_nested() && - self.dim() == 4 && - mat2.dim() == 2 && - get_nested_tensor_impl(self)->opt_size(2).has_value() && - get_nested_tensor_impl(self)->opt_size(3).has_value() && - self.size(3) == mat2.size(0) - ) { + if (self.is_nested() && self.is_contiguous() && !mat2.is_nested() && + self.dim() == 4 && mat2.dim() == 2 && + get_nested_tensor_impl(self)->opt_size(2).has_value() && + get_nested_tensor_impl(self)->opt_size(3).has_value() && + self.size(3) == mat2.size(0)) { return matmul_nested_with_broadcasted_dense(self, mat2); } if (self.is_nested() && !mat2.is_nested()) { - AT_ERROR("Expected both to be nested, but got a nested self and non-nested other"); - } - else if (!self.is_nested() && mat2.is_nested()) { - AT_ERROR("Expected both to be nested, but got a non-nested self and nested other"); + AT_ERROR( + "Expected both to be nested, but got a nested self and non-nested other"); + } else if (!self.is_nested() && mat2.is_nested()) { + AT_ERROR( + "Expected both to be nested, but got a non-nested self and nested other"); } // to_padded_tensor only supports contiguous inputs auto self_contig = self.contiguous(); @@ -245,8 +239,7 @@ Tensor matmul_nested(const Tensor& self, const Tensor& mat2) { // dispatcher should have guaranteed that at least one is nested const auto self_ptr = get_nested_tensor_impl(self_contig); const auto mat2_ptr = get_nested_tensor_impl(mat2_contig); - int64_t self_dim = self_ptr->dim(), - mat2_dim = mat2_ptr->dim(); + int64_t self_dim = self_ptr->dim(), mat2_dim = mat2_ptr->dim(); TORCH_CHECK( self_dim >= 3, "matmul: For nested tensors, only inputs with >= 3 dims are currently supported. 1st input has rank: ", @@ -255,41 +248,47 @@ Tensor matmul_nested(const Tensor& self, const Tensor& mat2) { mat2_dim >= 3, "matmul: For nested tensors, only inputs with >= 3 dims are currently supported. 
2nd input has rank: ", mat2_dim); - TORCH_CHECK(self_dim == mat2_dim, "matmul: both inputs must have the same rank"); - int64_t ntensors = self_ptr->size(0), - ntensors2 = mat2_ptr->size(0); - TORCH_CHECK(ntensors == ntensors2, - "matmul: Expected size for the 1st dimension of 2nd input tensor to be: ", ntensors, - " but got: ", ntensors2, "."); + TORCH_CHECK( + self_dim == mat2_dim, "matmul: both inputs must have the same rank"); + int64_t ntensors = self_ptr->size(0), ntensors2 = mat2_ptr->size(0); + TORCH_CHECK( + ntensors == ntensors2, + "matmul: Expected size for the 1st dimension of 2nd input tensor to be: ", + ntensors, + " but got: ", + ntensors2, + "."); // Ensure batch dimensions have the same sizes (no broadcasting). const auto& self_sizes = self_ptr->get_nested_sizes(); const auto& mat2_sizes = mat2_ptr->get_nested_sizes(); - const auto& self_batch_sizes = self_sizes.narrow(1, 0, self_dim-3); - const auto& mat2_batch_sizes = mat2_sizes.narrow(1, 0, mat2_dim-3); - TORCH_CHECK(at::equal(self_batch_sizes, mat2_batch_sizes), - "matmul: For nested tensors, batch dimensions must have the same sizes, ", - "no broadcasting is currently performed. Got batch shapes for self ", - self_batch_sizes, - " and batch shapes for mat2 ", - mat2_batch_sizes); + const auto& self_batch_sizes = self_sizes.narrow(1, 0, self_dim - 3); + const auto& mat2_batch_sizes = mat2_sizes.narrow(1, 0, mat2_dim - 3); + TORCH_CHECK( + at::equal(self_batch_sizes, mat2_batch_sizes), + "matmul: For nested tensors, batch dimensions must have the same sizes, ", + "no broadcasting is currently performed. Got batch shapes for self ", + self_batch_sizes, + " and batch shapes for mat2 ", + mat2_batch_sizes); // Ensure last dim of self and second last dim of mat2 have the same size const auto& self_dim_size = self_sizes.select(1, -1); const auto& mat2_dim_size = mat2_sizes.select(1, -2); - TORCH_CHECK(at::equal(self_dim_size, mat2_dim_size), - "matmul: Nested tensors cannot be matrix multiplied, last dimension of self has sizes", - self_dim_size, - "second last dimension of mat2 has sizes", - mat2_dim_size); - - // use bmm inference-only fast path for [N, n_heads, *, head_dim] [N, n_heads, head_dim, *] - if (self.is_cuda() && - self_dim == 4 && self.is_contiguous() && + TORCH_CHECK( + at::equal(self_dim_size, mat2_dim_size), + "matmul: Nested tensors cannot be matrix multiplied, last dimension of self has sizes", + self_dim_size, + "second last dimension of mat2 has sizes", + mat2_dim_size); + + // use bmm inference-only fast path for [N, n_heads, *, head_dim] [N, n_heads, + // head_dim, *] + if (self.is_cuda() && self_dim == 4 && self.is_contiguous() && mat2_dim == 4 && mat2.is_contiguous() && - !(GradMode::is_enabled() && (self.requires_grad() || mat2.requires_grad()))) { + !(GradMode::is_enabled() && + (self.requires_grad() || mat2.requires_grad()))) { const auto& self_opt_head_dim = self_ptr->opt_size(1); const auto& mat2_opt_head_dim = mat2_ptr->opt_size(1); - if (self_opt_head_dim.has_value() && - mat2_opt_head_dim.has_value() && + if (self_opt_head_dim.has_value() && mat2_opt_head_dim.has_value() && self_opt_head_dim.value() == mat2_opt_head_dim.value()) { return matmul_with_bmm_nested(self, mat2); } @@ -297,8 +296,10 @@ Tensor matmul_nested(const Tensor& self, const Tensor& mat2) { // Construct output size from input sizes Tensor output_sizes = self_sizes.clone(); - // The last entry in every row of output_sizes should be last column of mat2_sizes - output_sizes.index_put_({at::indexing::Slice(), -1}, 
mat2_sizes.select(1, -1).clone()); + // The last entry in every row of output_sizes should be last column of + // mat2_sizes + output_sizes.index_put_( + {at::indexing::Slice(), -1}, mat2_sizes.select(1, -1).clone()); auto self_padded = self_contig.to_padded_tensor(0.); auto mat2_padded = mat2_contig.to_padded_tensor(0.); @@ -307,7 +308,10 @@ Tensor matmul_nested(const Tensor& self, const Tensor& mat2) { return output_nested; } -Tensor& matmul_out_nested(const Tensor& tensor1, const Tensor& tensor2, Tensor& result) { +Tensor& matmul_out_nested( + const Tensor& tensor1, + const Tensor& tensor2, + Tensor& result) { // TODO: this is a very quick and dirty implementation // should improve it to avoid the intermediate memory usage Tensor function_result = at::matmul(tensor1, tensor2); @@ -319,8 +323,7 @@ Tensor& matmul_out_nested(const Tensor& tensor1, const Tensor& tensor2, Tensor& c10::optional opt_size = function_result_ptr->opt_size(i); if (opt_size.has_value()) { sizes.push_back(*opt_size); - } - else { + } else { sizes.push_back(-1); } } diff --git a/aten/src/ATen/native/nested/NestedTensorUtils.h b/aten/src/ATen/native/nested/NestedTensorUtils.h index 206899875156b..3b4f18f11b64b 100644 --- a/aten/src/ATen/native/nested/NestedTensorUtils.h +++ b/aten/src/ATen/native/nested/NestedTensorUtils.h @@ -119,7 +119,7 @@ inline std::vector NestedTensor_get_sizes( if (orig_dim == 0) { return sizes; } - const int64_t* sizemat_ptr = sizemat.data_ptr(); + const int64_t* sizemat_ptr = sizemat.const_data_ptr(); for (const auto i : c10::irange(ntensors)) { sizes[i] = IntArrayRef(sizemat_ptr, sizemat_ptr + orig_dim); @@ -152,7 +152,7 @@ inline std::vector NestedTensor_get_strides( if (orig_dim == 0) { return strides; } - const int64_t* stridemat_ptr = stridemat.data_ptr(); + const int64_t* stridemat_ptr = stridemat.const_data_ptr(); for (const auto i : c10::irange(ntensors)) { strides[i] = IntArrayRef(stridemat_ptr, stridemat_ptr + orig_dim); stridemat_ptr += orig_dim; @@ -178,6 +178,40 @@ inline void check_numel_equals_buffer_size(const NestedTensorImpl* self_ptr) { self_ptr->numel() == static_cast(self_ptr->get_buffer_size()), "Number of elements in nested tensor must match number of elements in buffer."); } + +// Helper function to get size / stride / offset for a nested/normal tensor. +inline IntArrayRef get_size_for_index(const Tensor& tensor, int i) { + if (tensor.is_nested()) { + std::vector tensor_sizes = + NestedTensor_get_sizes(get_nested_tensor_impl(tensor)); + return tensor_sizes[i]; + } else { + return tensor.sizes().slice(1); + } +} + +inline IntArrayRef get_stride_for_index(const Tensor& tensor, int i) { + if (tensor.is_nested()) { + std::vector tensor_strides = + NestedTensor_get_strides(get_nested_tensor_impl(tensor)); + return tensor_strides[i]; + } else { + return tensor.strides().slice(1); + } +} + +inline int64_t get_offset_for_index(const Tensor& tensor, int i) { + if (tensor.is_nested()) { + int64_t* offsets_ptr = get_nested_tensor_impl(tensor) + ->get_storage_offsets() + .data_ptr(); + return offsets_ptr[i]; + + } else { + int64_t offset = tensor.storage_offset(); + return offset + tensor.strides()[0] * i; + } +} // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Data structures and functions for generically applying a function on a nested // tensor. 
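For reference, the dense branch of the get_size_for_index / get_stride_for_index / get_offset_for_index helpers added just above is simply the arithmetic an ordinary batched tensor needs for as_strided: drop the leading batch dimension from sizes/strides and advance the storage offset by strides()[0] * i. A minimal standalone sketch of that dense case (the slice_view name is made up for illustration and is not part of this patch):

#include <ATen/ATen.h>
#include <cstdio>

// Rebuild batch element i of a dense 3-D tensor the same way the dense
// branch of the new helpers feeds as_strided.
static at::Tensor slice_view(const at::Tensor& t, int64_t i) {
  return t.as_strided(
      t.sizes().slice(1),                        // drop the batch dim
      t.strides().slice(1),                      // per-matrix strides
      t.storage_offset() + t.strides()[0] * i);  // jump to batch i
}

int main() {
  at::Tensor b = at::rand({4, 3, 5});
  // slice_view(b, 2) should alias the same elements as b.select(0, 2).
  std::printf("matches select: %d\n",
              static_cast<int>(at::equal(slice_view(b, 2), b.select(0, 2))));
  return 0;
}

This is what lets the bmm paths elsewhere in this patch gather per-matrix sizes, strides, and offsets uniformly whether an operand is nested or a regular dense batch.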
@@ -193,7 +227,8 @@ struct NestedNode { // NestedNode(NestedNode&) = delete; // NestedNode(const NestedNode&) = delete; // NestedNode& operator=(NestedNode) = delete; - explicit NestedNode(T payload) : _is_leaf(true), _payload(std::move(payload)) {} + explicit NestedNode(T payload) + : _is_leaf(true), _payload(std::move(payload)) {} inline bool is_leaf() const { return _is_leaf; } @@ -367,7 +402,7 @@ inline Tensor wrap_tensor_node( if (tensor_node.children(i).numel() > 0) { memcpy( nt_buffer.mutable_data_ptr() + start_offsets[i], - tensor_node.children(i).data_ptr(), + tensor_node.children(i).const_data_ptr(), tensor_node.children(i).numel() * sizeof(scalar_t)); } } diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorBinaryOps.cu b/aten/src/ATen/native/nested/cuda/NestedTensorBinaryOps.cu index f7055d7fd0330..350c3a27e77b0 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorBinaryOps.cu +++ b/aten/src/ATen/native/nested/cuda/NestedTensorBinaryOps.cu @@ -85,8 +85,8 @@ void _nested_op_dense_esuhm_kernel(Tensor& result, const Tensor& self, const Ten auto result_offsets = at::cat({offsets, at::tensor(self_ptr->numel())}); result_offsets = result_offsets.to(kCUDA); - const scalar_t* self_data_ptr = self_buffer.data_ptr(); - const scalar_t* other_data_ptr = other.data_ptr(); + const scalar_t* self_data_ptr = self_buffer.const_data_ptr(); + const scalar_t* other_data_ptr = other.const_data_ptr(); scalar_t* result_data_ptr = result_buffer.data_ptr(); int64_t* result_offsets_ptr = result_offsets.data_ptr(); diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorMatmul.cu b/aten/src/ATen/native/nested/cuda/NestedTensorMatmul.cu index f5b56b2a8c47e..252e3741c5c7d 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorMatmul.cu +++ b/aten/src/ATen/native/nested/cuda/NestedTensorMatmul.cu @@ -283,16 +283,10 @@ bool group_gemm_dispatch( #endif Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) { - if (self.is_nested() && !mat2.is_nested()) { - AT_ERROR( - "Expected both to be nested, but got a nested self and non-nested other"); - } else if (!self.is_nested() && mat2.is_nested()) { - AT_ERROR( - "Expected both to be nested, but got a non-nested self and nested other"); - } + // dispatcher should have guaranteed that at least one is nested - auto self_ptr = get_nested_tensor_impl(self); - auto mat2_ptr = get_nested_tensor_impl(mat2); + auto self_ptr = self.is_nested() ? get_nested_tensor_impl(self) : self.unsafeGetTensorImpl(); + auto mat2_ptr = mat2.is_nested() ? get_nested_tensor_impl(mat2) : mat2.unsafeGetTensorImpl(); TORCH_CHECK(self_ptr->dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(mat2_ptr->dim() == 3, "batch2 must be a 3D tensor"); int64_t ntensors = self_ptr->size(0), ntensors2 = mat2_ptr->size(0); @@ -305,16 +299,15 @@ Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) { "."); // create a contiguous output - const Tensor& self_sizemat = self_ptr->get_nested_sizes(); + const Tensor& self_sizemat = self.is_nested() ? 
+ get_nested_tensor_impl(self)->get_nested_sizes() : get_nested_tensor_impl(mat2)->get_nested_sizes(); + Tensor out_sizemat = self_sizemat.new_empty(self_sizemat.sizes()); int64_t* out_sizemat_ptr = out_sizemat.data_ptr(); - std::vector self_sizes = NestedTensor_get_sizes(self_ptr); - std::vector mat2_sizes = NestedTensor_get_sizes(mat2_ptr); - int64_t out_numel = 0; for (int64_t i = 0; i < ntensors; i++) { - const IntArrayRef &self_shape = self_sizes[i], &mat2_shape = mat2_sizes[i]; + const IntArrayRef &self_shape = get_size_for_index(self, i), &mat2_shape = get_size_for_index(mat2, i); const int64_t &self_size0 = self_shape[0], &self_size1 = self_shape[1], &mat2_size0 = mat2_shape[0], &mat2_size1 = mat2_shape[1]; TORCH_CHECK( @@ -334,17 +327,15 @@ Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) { out_sizemat_ptr += 2; out_numel += self_size0 * mat2_size1; } - const Tensor &self_buffer = self_ptr->get_unsafe_storage_as_tensor(); - const Tensor &mat2_buffer = mat2_ptr->get_unsafe_storage_as_tensor(); + + const Tensor &self_buffer = self.is_nested() ? get_nested_tensor_impl(self)->get_unsafe_storage_as_tensor() : self; + const Tensor &mat2_buffer = mat2.is_nested() ? get_nested_tensor_impl(mat2)->get_unsafe_storage_as_tensor() : mat2; + Tensor out_buffer = self_buffer.new_empty(out_numel); Tensor output = wrap_buffer(out_buffer, out_sizemat); auto out_ptr = get_nested_tensor_impl(output); - std::vector self_strides = NestedTensor_get_strides(self_ptr); - std::vector mat2_strides = NestedTensor_get_strides(mat2_ptr); - const int64_t *self_offsets_ptr = self_ptr->get_storage_offsets().data_ptr(); - const int64_t *mat2_offsets_ptr = mat2_ptr->get_storage_offsets().data_ptr(); - const int64_t *out_offsets_ptr = out_ptr->get_storage_offsets().data_ptr(); + const int64_t *out_offsets_ptr = out_ptr->get_storage_offsets().const_data_ptr(); #ifndef USE_ROCM #ifndef _WIN32 @@ -360,21 +351,23 @@ Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) { std::vector gemm_sizes; bool all_row_major = true; for (int64_t i = 0; i < ntensors; i++) { - const IntArrayRef& self_shape = self_sizes[i]; - const IntArrayRef& mat2_shape = mat2_sizes[i]; + const IntArrayRef& self_shape = get_size_for_index(self, i); + const IntArrayRef& mat2_shape = get_size_for_index(mat2, i); const int64_t &self_size0 = self_shape[0]; const int64_t &self_size1 = self_shape[1]; const int64_t &mat2_size0 = mat2_shape[0]; const int64_t &mat2_size1 = mat2_shape[1]; gemm_sizes.push_back( cutlass::gemm::GemmCoord(self_size0, mat2_size1, self_size1)); - aptr[i] = self_buffer.data_ptr() + self_offsets_ptr[i]; - bptr[i] = mat2_buffer.data_ptr() + mat2_offsets_ptr[i]; + aptr[i] = self_buffer.data_ptr() + get_offset_for_index(self, i); + bptr[i] = mat2_buffer.data_ptr() + get_offset_for_index(mat2, i); dptr[i] = out_buffer.data_ptr() + out_offsets_ptr[i]; - all_row_major = all_row_major && (self_strides[i][1] == 1); - all_row_major = all_row_major && (mat2_strides[i][1] == 1); - lda[i] = self_strides[i][0]; - ldb[i] = mat2_strides[i][0]; + auto self_stride = get_stride_for_index(self, i); + auto mat2_stride = get_stride_for_index(mat2, i); + all_row_major = all_row_major && (self_stride[1] == 1); + all_row_major = all_row_major && (mat2_stride[1] == 1); + lda[i] = self_stride[0]; + ldb[i] = mat2_stride[0]; ldd[i] = mat2_size1; } auto dprops = at::cuda::getCurrentDeviceProperties(); @@ -403,11 +396,9 @@ Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) { std::vector output_unbind = output.unbind(); for 
(int64_t i = 0; i < ntensors; i++) { - at::mm_out( - output_unbind[i], - self_buffer.as_strided(self_sizes[i], self_strides[i], self_offsets_ptr[i]), - mat2_buffer.as_strided( - mat2_sizes[i], mat2_strides[i], mat2_offsets_ptr[i])); + at::mm_out(output_unbind[i], + self_buffer.as_strided(get_size_for_index(self, i), get_stride_for_index(self, i), get_offset_for_index(self, i)), + mat2_buffer.as_strided(get_size_for_index(mat2, i), get_stride_for_index(mat2, i), get_offset_for_index(mat2, i))); } return output; } diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp index 8955585b432e8..0da0c3e361d1f 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp @@ -28,7 +28,7 @@ namespace { int64_t padded_tensor_numel(const Tensor& sizes) { const auto sizes_num_rows = sizes.sizes()[0]; const auto sizes_row_length = sizes.sizes()[1]; - const auto* sizes_data = sizes.data_ptr(); + const auto* sizes_data = sizes.const_data_ptr(); int64_t numel = 0; for (const auto row_num : c10::irange(sizes_num_rows)) { const auto* row_ptr = sizes_data + row_num * sizes_row_length; diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerUtils.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerUtils.cpp index a209c766c24ec..f708920d04dfa 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerUtils.cpp +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerUtils.cpp @@ -78,8 +78,8 @@ int64_t get_nnz(Tensor nestedtensor) { * use with the flash-attention and efficient_attention kernels without * needing to call contiguous on the nested tensor input. * It checks that the storage offsets' adjacent_differences are a constant - * mutiple of the previous tensor in the nested tensor and that the strides - * are monitonically decreasing. This check is done after calling transpose on + * multiple of the previous tensor in the nested tensor and that the strides + * are monotonically decreasing. This check is done after calling transpose on * the nested tensor. 
Resulting in a Nt of shape [bsz, {seq_len}, num_heads, dim] * * @return A boolean indicating of contiguous needs to be called for input @@ -133,8 +133,8 @@ int64_t get_nnz(Tensor nestedtensor) { } // Check the offsets are a constant multiple from the previous numels - const int64_t* tensor_size_ptr = tensor_sizes.data_ptr(); - const int64_t* tensor_stride_ptr = tensor_strides.data_ptr(); + const int64_t* tensor_size_ptr = tensor_sizes.const_data_ptr(); + const int64_t* tensor_stride_ptr = tensor_strides.const_data_ptr(); int64_t numel_0 = (tensor_size_ptr[0] * tensor_stride_ptr[0]); TORCH_INTERNAL_ASSERT(numel_0 > 0, "numels must be positive!"); diff --git a/aten/src/ATen/native/quantized/ConvUtils.h b/aten/src/ATen/native/quantized/ConvUtils.h index 092f68e7d5b63..6f8ff918c1d2f 100644 --- a/aten/src/ATen/native/quantized/ConvUtils.h +++ b/aten/src/ATen/native/quantized/ConvUtils.h @@ -5,7 +5,7 @@ namespace at::native::quantized { namespace { // MakeConvOutputShape used from both CPU and CUDA libraries -// and exporting symbol from torch_cpu would probaby take more storage +// and exporting symbol from torch_cpu would probably take more storage // than duplicating implementation which likely be inlined away template at::SmallVector MakeConvOutputShape( diff --git a/aten/src/ATen/native/quantized/FakeQuantPerChannelAffine.cpp b/aten/src/ATen/native/quantized/FakeQuantPerChannelAffine.cpp index 91db5a7ae313a..25dbc0926ccf0 100644 --- a/aten/src/ATen/native/quantized/FakeQuantPerChannelAffine.cpp +++ b/aten/src/ATen/native/quantized/FakeQuantPerChannelAffine.cpp @@ -48,6 +48,8 @@ std::tuple fake_quantize_per_channel_affine_cachemask( int64_t axis, int64_t quant_min, int64_t quant_max) { + TORCH_CHECK(scale.scalar_type() == ScalarType::Float, + "Scale must be Float, found ", scale.scalar_type()); TORCH_CHECK(zero_point.scalar_type() == ScalarType::Int || zero_point.scalar_type() == ScalarType::Float || zero_point.scalar_type() == ScalarType::Half, "Zero-point must be Int32, Float or Half, found ", zero_point.scalar_type()); TORCH_CHECK(scale.dim() == 1, "scale should be a 1-D tensor"); diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index b8841214fdcb2..9705de0a4a54d 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -344,7 +344,7 @@ std::tuple choose_qparams_optimized( TORCH_CHECK(numel <= input_tensor.numel(), "numel ", numel, " greater than input_tensor.numel() ", input_tensor.numel()); - const float* input_row = input_tensor.data_ptr(); + const float* input_row = input_tensor.const_data_ptr(); float xmin = *std::min_element(input_row, input_row + numel); float xmax = *std::max_element(input_row, input_row + numel); @@ -352,7 +352,7 @@ std::tuple choose_qparams_optimized( // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) int min_bins = n_bins * (1.0 - (float) ratio); Tensor input_tensor_contig = input_tensor.contiguous(); - const float* input = input_tensor_contig.data_ptr(); + const float* input = input_tensor_contig.const_data_ptr(); std::vector q_input(numel); float loss = diff --git a/aten/src/ATen/native/quantized/TensorCompare.cpp b/aten/src/ATen/native/quantized/TensorCompare.cpp index 25cf5c6c93a35..def1622863e1d 100644 --- a/aten/src/ATen/native/quantized/TensorCompare.cpp +++ b/aten/src/ATen/native/quantized/TensorCompare.cpp @@ -50,9 +50,7 @@ std::tuple sort_quantized_cpu_stable( c10::optional stable, int64_t dim, bool descending) { 
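As an aside on the TensorCompare.cpp hunk that continues just below (it only swaps std::tie for a structured binding), the user-visible behavior is that sorting a per-tensor quantized tensor sorts its integer representation and rewraps the result with the original quantization parameters. A small sketch of that, assuming the quantized CPU sort kernel shown here is what the dispatcher ends up calling:

#include <ATen/ATen.h>
#include <cstdio>

int main() {
  at::Tensor x = at::rand({6});
  at::Tensor q = at::quantize_per_tensor(x, /*scale=*/0.1, /*zero_point=*/0, at::kQUInt8);
  // values is again quantized, with the same scale / zero_point as q.
  auto [values, indices] = at::sort(q, /*dim=*/-1, /*descending=*/false);
  std::printf("scale=%.3f zero_point=%lld\n",
              values.q_scale(),
              static_cast<long long>(values.q_zero_point()));
  return 0;
}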
- Tensor sort_int; - Tensor sort_indicies; - std::tie(sort_int, sort_indicies) = + auto [sort_int, sort_indicies] = at::sort(self.int_repr(), stable, dim, descending); return std::forward_as_tuple( at::_make_per_tensor_quantized_tensor( diff --git a/aten/src/ATen/native/quantized/cpu/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/quantized/cpu/AdaptiveAveragePooling.cpp index 1317817902cfb..74476e0a80ae0 100644 --- a/aten/src/ATen/native/quantized/cpu/AdaptiveAveragePooling.cpp +++ b/aten/src/ATen/native/quantized/cpu/AdaptiveAveragePooling.cpp @@ -16,7 +16,6 @@ #endif #include -#include #include #include diff --git a/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp b/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp index bb72a2010ca3b..754c7d6bd529b 100644 --- a/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp @@ -18,7 +18,6 @@ #endif #include -#include #include #include @@ -188,10 +187,9 @@ Tensor q_avg_pool2d( bool count_include_pad, c10::optional divisor_override) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int kW, kH, dW, dH, padW, padH; - std::tie(kW, kH) = get_kernel(kernel_size); - std::tie(dW, dH) = get_stride(stride, kW, kH); - std::tie(padW, padH) = get_padding(padding); + auto [kW, kH] = get_kernel(kernel_size); + auto [dW, dH] = get_stride(stride, kW, kH); + auto [padW, padH] = get_padding(padding); const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1; const int64_t nInputPlane = input.size(-3); @@ -268,12 +266,9 @@ Tensor qnnpack_avg_pool2d( bool ceil_mode, bool count_include_pad, c10::optional divisor_override) { - Tensor output; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int kW, kH, dW, dH, padW, padH; - std::tie(kW, kH) = get_kernel(kernel_size); - std::tie(dW, dH) = get_stride(stride, kW, kH); - std::tie(padW, padH) = get_padding(padding); + auto [kW, kH] = get_kernel(kernel_size); + auto [dW, dH] = get_stride(stride, kW, kH); + auto [padW, padH] = get_padding(padding); TORCH_CHECK( input.ndimension() == 4, "qnnpack_avg_pool2d(): Expected input to be 4-dimensional: got ", @@ -304,7 +299,7 @@ Tensor qnnpack_avg_pool2d( oH > 0 && oW > 0, "qnnpack_avg_pool2d(): the resulting output Tensor size should be >= 0"); // NHWC output - output = at::_empty_affine_quantized( + auto output = at::_empty_affine_quantized( output_shape, at::device(kCPU).dtype(kQUInt8), scale, diff --git a/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp b/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp index 93534b70c2c0f..875ae28e46a96 100644 --- a/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp +++ b/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp @@ -14,8 +14,6 @@ #include #endif -#include - #include namespace at { @@ -103,11 +101,9 @@ Tensor q_avg_pool3d( bool ceil_mode, bool count_include_pad, c10::optional divisor_override) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int kD, kW, kH, dD, dW, dH, padD, padW, padH; - std::tie(kW, kH, kD) = get_kernel(kernel_size); - std::tie(dW, dH, dD) = get_stride(stride, kW, kH, kD); - std::tie(padW, padH, padD) = get_padding(padding); + auto [kW, kH, kD] = get_kernel(kernel_size); + auto [dW, dH, dD] = get_stride(stride, kW, kH, kD); + auto [padW, padH, padD] = get_padding(padding); const int64_t nbatch = input.ndimension() == 5 ? 
input.size(-5) : 1; const int64_t nInputPlane = input.size(-4); diff --git a/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp b/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp index a25e2e23a32df..8b5fb286ec611 100644 --- a/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp +++ b/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp @@ -326,8 +326,8 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) { auto output_min = std::numeric_limits::min(); if (ReLUFused) { /* - * FIXME: use acticationLimits() - * With , MSVC runs into "error C3862: indetifier activationLimits not found". + * FIXME: use activationLimits() + * With , MSVC runs into "error C3862: identifier activationLimits not found". */ constexpr int64_t qmin = std::numeric_limits::min(); constexpr int64_t qmax = std::numeric_limits::max(); @@ -405,7 +405,7 @@ Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) { #endif // USE_XNNPACK #ifdef USE_PYTORCH_QNNPACK - if(qa.sizes() == qb.sizes() && /* qnnpack does not support boradcasting */ + if(qa.sizes() == qb.sizes() && /* qnnpack does not support broadcasting */ qa.scalar_type() == kQUInt8) { return qnnpack_add(qa, qb, scale, zero_point); } diff --git a/aten/src/ATen/native/quantized/cpu/IntReprQuant.cpp b/aten/src/ATen/native/quantized/cpu/IntReprQuant.cpp index 9867a8f48a9ea..cfcce3465a731 100644 --- a/aten/src/ATen/native/quantized/cpu/IntReprQuant.cpp +++ b/aten/src/ATen/native/quantized/cpu/IntReprQuant.cpp @@ -32,7 +32,7 @@ Tensor int_repr_quantized_cpu(const Tensor& self) { {out_size}, self.options().dtype(UNDERLYING_TYPE), self.suggest_memory_format()); - const underlying_t* qdata = reinterpret_cast(self.data_ptr()); + const underlying_t* qdata = reinterpret_cast(self.const_data_ptr()); for (const auto i : c10::irange(dst.numel())) { dst[i] = static_cast(qdata[i]); } diff --git a/aten/src/ATen/native/quantized/cpu/Normalization.cpp b/aten/src/ATen/native/quantized/cpu/Normalization.cpp index 05a9585274306..0f5fb9884a9c5 100644 --- a/aten/src/ATen/native/quantized/cpu/Normalization.cpp +++ b/aten/src/ATen/native/quantized/cpu/Normalization.cpp @@ -80,8 +80,8 @@ Tensor q_batch_norm1d_impl( TORCH_CHECK(weight.numel() == C, "Expect weight size to match C"); TORCH_CHECK(bias.numel() == C, "Expect weight size to match C"); - const float* weight_data = weight.template data_ptr(); - const float* bias_data = bias.template data_ptr(); + const float* weight_data = weight.template const_data_ptr(); + const float* bias_data = bias.template const_data_ptr(); TORCH_CHECK(mean.numel() == C, "Mean size must match channel dimension"); TORCH_CHECK(var.numel() == C, "Variance size must match channel dimension"); @@ -91,8 +91,8 @@ Tensor q_batch_norm1d_impl( float* alpha_data = alpha.mutable_data_ptr(); float* beta_data = beta.data_ptr(); - const float* mean_data = mean.template data_ptr(); - const float* var_data = var.template data_ptr(); + const float* mean_data = mean.template const_data_ptr(); + const float* var_data = var.template const_data_ptr(); if (ndim == 2) { // create a fake H and W dimension so we can use NHWC @@ -189,8 +189,8 @@ Tensor q_batch_norm2d_impl( TORCH_CHECK(weight.numel() == C, "Expect weight size to match C"); TORCH_CHECK(bias.numel() == C, "Expect weight size to match C"); - const float* weight_data = weight.template data_ptr(); - const float* bias_data = bias.template data_ptr(); + const float* weight_data = weight.template const_data_ptr(); + const float* bias_data = bias.template const_data_ptr(); TORCH_CHECK(mean.numel() == C, 
"Mean size must match channel dimension"); TORCH_CHECK(var.numel() == C, "Variance size must match channel dimension"); @@ -200,8 +200,8 @@ Tensor q_batch_norm2d_impl( float* alpha_data = alpha.mutable_data_ptr(); float* beta_data = beta.data_ptr(); - const float* mean_data = mean.template data_ptr(); - const float* var_data = var.template data_ptr(); + const float* mean_data = mean.template const_data_ptr(); + const float* var_data = var.template const_data_ptr(); auto oSizes = qx.sizes(); auto qx_nhwc = qx.contiguous(MemoryFormat::ChannelsLast); @@ -285,8 +285,8 @@ Tensor q_batch_norm3d_impl( TORCH_CHECK(weight.numel() == C, "Expect weight size to match C"); TORCH_CHECK(bias.numel() == C, "Expect weight size to match C"); - const float* weight_data = weight.template data_ptr(); - const float* bias_data = bias.template data_ptr(); + const float* weight_data = weight.template const_data_ptr(); + const float* bias_data = bias.template const_data_ptr(); TORCH_CHECK(mean.numel() == C, "Mean size must match channel dimension"); TORCH_CHECK(var.numel() == C, "Variance size must match channel dimension"); @@ -296,8 +296,8 @@ Tensor q_batch_norm3d_impl( float* alpha_data = alpha.mutable_data_ptr(); float* beta_data = beta.data_ptr(); - const float* mean_data = mean.template data_ptr(); - const float* var_data = var.template data_ptr(); + const float* mean_data = mean.template const_data_ptr(); + const float* var_data = var.template const_data_ptr(); auto oSizes = qx.sizes(); auto qx_nhwc = qx.contiguous(MemoryFormat::ChannelsLast3d); diff --git a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h index 4537feddd0c7b..8887bb83deb91 100644 --- a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h +++ b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h @@ -115,13 +115,6 @@ enum PostOps { Gelu }; -static std::unordered_map POST_OP_TABLE = { - {"none", NoPostOp}, - {"relu", Relu}, - {"leaky_relu", LeakyRelu}, - {"tanh", Tanh}, - {"gelu", Gelu} -}; struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { PackedLinearWeightsOnednn( @@ -317,19 +310,81 @@ struct PackedConvWeightsOnednn : public ConvPackedParamsBase { namespace onednn_utils { static ideep::attr_t create_attr_by_post_op( - const std::string& post_op_name, - const torch::List>& post_op_args, - const dnnl::algorithm post_algorithm) { + const c10::string_view& binary_post_op, + double binary_alpha, + double input1_scale, + int64_t input1_zero_point, + const ideep::tensor::desc& input1_desc, + const c10::string_view& unary_post_op, + const torch::List>& unary_post_op_args, + const c10::string_view& unary_post_op_algorithm) { using ideep::tensor; - PostOps post_op = POST_OP_TABLE[post_op_name]; - if (post_op == Relu) { - return ideep::attr_t::fuse_relu(); - } else if (post_op == LeakyRelu) { - return ideep::attr_t::fuse_relu_v2(/*alpha=*/post_op_args[0].value().to()); - } else if (post_op == Tanh) { - return ideep::attr_t::fuse_tanh(); - } else if (post_op == Gelu) { - return ideep::attr_t::fuse_gelu_v2(0.f, 0.f, post_algorithm); + if (binary_post_op == "none") { + if (unary_post_op == "relu") { + return ideep::attr_t::fuse_relu(); + } else if (unary_post_op == "leaky_relu") { + TORCH_CHECK( + unary_post_op_args.size() == 1, + "onednn qlinear: expect one argument for post op leaky_relu but got ", unary_post_op_args.size(), " args"); + auto alpha = unary_post_op_args[0].value().to(); + return ideep::attr_t::fuse_relu_v2(alpha); + } else if (unary_post_op == "tanh") { + return 
ideep::attr_t::fuse_tanh(); + } else if (unary_post_op == "gelu") { + TORCH_CHECK( + unary_post_op_algorithm == "none" || unary_post_op_algorithm == "tanh", + "onednn qlinear: algorithm for post op gelu must be none or tanh but got ", unary_post_op_algorithm); + auto post_algorithm = unary_post_op_algorithm == "none" ? + dnnl::algorithm::eltwise_gelu_erf : + dnnl::algorithm::eltwise_gelu_tanh; + return ideep::attr_t::fuse_gelu_v2(0.f, 0.f, post_algorithm); + } else if (unary_post_op == "hardtanh") { + TORCH_CHECK( + unary_post_op_args.size() == 2 && + unary_post_op_args[0].has_value() && + unary_post_op_args[1].has_value(), + "hardtanh is expected to have two scalar input: min_val and max_val"); + auto lower_bound_value = + unary_post_op_args[0].value().to(); + auto upper_bound_value = + unary_post_op_args[1].value().to(); + return ideep::attr_t::fuse_clamp(lower_bound_value, upper_bound_value); + } else if (unary_post_op == "hardswish") { + return ideep::attr_t::fuse_hardswish(); + } else if (unary_post_op == "swish") { + return ideep::attr_t::fuse_swish(); + } else { + TORCH_CHECK( + unary_post_op == "none", + "onednn qlinear: unsupported unary post op ", unary_post_op); + } + } else if (binary_post_op == "sum") { + if (unary_post_op == "none") { + return ideep::attr_t::fuse_sum(input1_scale, input1_zero_point); + } else if (unary_post_op == "relu") { + return ideep::attr_t::residual_with_sum_zero_point(input1_scale, input1_zero_point); + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op sum"); + } + } else if (binary_post_op == "add") { + if (unary_post_op == "none") { + return ideep::attr_t::fuse_binary(ideep::algorithm::binary_add, input1_desc); + } else if (unary_post_op == "relu") { + ideep::post_ops po; + po.append_binary(ideep::algorithm::binary_add, input1_desc); + po.append_eltwise(ideep::algorithm::eltwise_relu, 0, 0); + return ideep::attr_t::attr_post_ops(po); + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op add"); + } + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported binary post op ", binary_post_op); } return ideep::attr_t(); } @@ -430,7 +485,7 @@ static at::Tensor _quantized_convolution_onednn( torch::List dilation, bool transposed, int64_t groups, - double inv_output_scale, + double output_scale, int64_t output_zero_point, c10::optional accum=c10::nullopt, // accum to fused with conv add double accum_scale=1.0, diff --git a/aten/src/ATen/native/quantized/cpu/RuyUtils.cpp b/aten/src/ATen/native/quantized/cpu/RuyUtils.cpp index c9aeb06930ddd..4a9791eb0faf3 100644 --- a/aten/src/ATen/native/quantized/cpu/RuyUtils.cpp +++ b/aten/src/ATen/native/quantized/cpu/RuyUtils.cpp @@ -32,6 +32,6 @@ void quantize_multiplier(double scale, } // namespace ruy_utils } // namespace native -} // namesplace +} // namespace #endif // USE_RUY_QMATMUL diff --git a/aten/src/ATen/native/quantized/cpu/RuyUtils.h b/aten/src/ATen/native/quantized/cpu/RuyUtils.h index aeb332af4ecae..72abe1ad817f4 100644 --- a/aten/src/ATen/native/quantized/cpu/RuyUtils.h +++ b/aten/src/ATen/native/quantized/cpu/RuyUtils.h @@ -16,6 +16,6 @@ void quantize_multiplier(double scale, } // namespace ruy_utils } // namespace native -} // namesplace +} // namespace #endif // USE_RUY_QMATMUL diff --git a/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp b/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp index 8a3d4b737f777..f428745eaa86f 100644 --- 
a/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp @@ -22,7 +22,7 @@ namespace at { namespace native { namespace { -// pre calcuate interpolation params on width +// pre calculate interpolation params on width struct UpsampleBilinearParamW { int64_t w1, w1p; float w0lambda, w1lambda; @@ -48,14 +48,14 @@ static void upsample_bilinear2d_out_frame( bool align_corners, c10::optional scales_h, c10::optional scales_w) { - auto* idata = static_cast(input.data_ptr()); + auto* idata = static_cast(input.const_data_ptr()); auto* odata = static_cast(output.data_ptr()); channels = channels * nbatch; if (channels == 0 || output_height == 0 || output_width == 0) { return; } - auto* i_p = reinterpret_cast(idata); + auto* i_p = reinterpret_cast(idata); auto* o_p = reinterpret_cast(odata); // special case: just copy diff --git a/aten/src/ATen/native/quantized/cpu/conv_serialization.h b/aten/src/ATen/native/quantized/cpu/conv_serialization.h index 3c79d806d31c2..9f452a1cc7213 100644 --- a/aten/src/ATen/native/quantized/cpu/conv_serialization.h +++ b/aten/src/ATen/native/quantized/cpu/conv_serialization.h @@ -236,9 +236,7 @@ ConvParamsSerializationTypeV2 serialize_conv( // clone to retain ownership of the data .clone(); - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = params->unpack(); + auto [weight, bias] = params->unpack(); non_optional.emplace_back(std::move(params_tensor)); non_optional.emplace_back(std::move(weight)); @@ -267,9 +265,7 @@ ConvParamsSerializationTypeV3 serialize_conv( config_vals.push_back(params->groups()); config_vals.push_back(params->transpose()); - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = params->unpack(); + auto [weight, bias] = params->unpack(); std::vector> tensors; tensors.emplace_back(); @@ -287,12 +283,7 @@ ConvParamsSerializationTypeV3 serialize_conv( template c10::intrusive_ptr> deserialize_conv( ConvParamsSerializationTypeV3 state) { - - int64_t version; - std::vector config_vals; - std::vector> tensors; - - std::tie(version, config_vals, tensors) = state; + auto [version, config_vals, tensors] = state; TORCH_INTERNAL_ASSERT(version == 3, "Unexpected serialized qconv version: ", version); TORCH_CHECK(tensors.size() == 3, "Wrong number of tensors", tensors.size()); diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp index 2d15e54c4052b..d942e2f161a26 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp @@ -410,16 +410,10 @@ register_conv_params() { return deserialize_conv(state); }) .def("weight", [](const c10::intrusive_ptr>& self) { - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = self->unpack(); - return weight; + return std::get<0>(self->unpack()); }) .def("bias", [](const c10::intrusive_ptr>& self) { - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = self->unpack(); - return bias; + return std::get<1>(self->unpack()); }) .def("unpack", &ConvPackedParamsBase::unpack) .def("stride", &ConvPackedParamsBase::stride) @@ -446,10 +440,7 @@ TORCH_API int register_linear_params() { .def_pickle( [](const c10::intrusive_ptr& params) -> SerializationType { // __getstate__ - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = params->unpack(); - return std::make_tuple(std::move(weight), std::move(bias)); + return params->unpack(); }, [](SerializationType state) -> 
c10::intrusive_ptr< @@ -501,10 +492,7 @@ TORCH_API int register_linear_params() { TORCH_CHECK(false, "Unknown qengine"); }) .def("bias", [](const c10::intrusive_ptr& self) { - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = self->unpack(); - return bias; + return std::get<1>(self->unpack()); }) .def("unpack", &LinearPackedParamsBase::unpack); // (1) we can't (easily) return the static initializer itself because it can have a different type because of selective build @@ -548,12 +536,7 @@ int register_embedding_params() { [](EmbeddingParamsSerializationType state) -> c10::intrusive_ptr { // __setstate__ call - std::vector tensors; - std::vector doubles; - std::vector longs; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t version; - std::tie(version, tensors, doubles, longs) = std::move(state); + auto [version, tensors, doubles, longs] = std::move(state); TORCH_INTERNAL_ASSERT(tensors.size() == 1, "EmbeddingPackedParams: Expected weight tensor to be serialized"); TORCH_INTERNAL_ASSERT(longs.size() == 1, "EmbeddingPackedParams: Expected bit_rate to be serialized"); diff --git a/aten/src/ATen/native/quantized/cpu/fused_obs_fake_quant.cpp b/aten/src/ATen/native/quantized/cpu/fused_obs_fake_quant.cpp index 77c60141b0655..409f6e38d3e0b 100644 --- a/aten/src/ATen/native/quantized/cpu/fused_obs_fake_quant.cpp +++ b/aten/src/ATen/native/quantized/cpu/fused_obs_fake_quant.cpp @@ -41,8 +41,8 @@ void calculate_moving_average( } else { std::tie(x_min, x_max) = at::aminmax(x); } - const float* min_curr_val = x_min.data_ptr(); - const float* max_curr_val = x_max.data_ptr(); + const float* min_curr_val = x_min.const_data_ptr(); + const float* max_curr_val = x_max.const_data_ptr(); // Moving Average Min/Max observer for input tensor float* running_min_val = running_min.data_ptr(); float* running_max_val = running_max.data_ptr(); diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 45d3c9a864ced..dc9063ecf46f1 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -913,7 +913,7 @@ void qhardsigmoid_kernel(const Tensor& qx, Tensor& qy) { fVec kThreeVec(3.0f); fVec kSixVec(6.0f); - // Naive implemenentation: uses dequantize/execute/quantize routine + // Naive implementation: uses dequantize/execute/quantize routine cpu_kernel_vec( iter, [&](scalar_t qx) -> scalar_t { @@ -1070,7 +1070,7 @@ void qthreshold_kernel( Vec threshold_vec = Vec(threshold_float); Vec value_vec = Vec(value_float); - // Naive implemenentation: uses dequantize/execute/quantize routine + // Naive implementation: uses dequantize/execute/quantize routine cpu_kernel_vec( iter, [&](scalar_t value_qx) -> scalar_t { @@ -1152,7 +1152,7 @@ void qtanh_kernel(const Tensor& qx, Tensor& qy) { auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg(); AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qtanh", [&]() { - // Naive implemenentation: uses dequantize/execute/quantize routine + // Naive implementation: uses dequantize/execute/quantize routine // - Output scale is set to 2.0 / 2^(BIT_NUM) // - For signed types output zero point is set to 0 // - For unsigned types output zero point is set to (qmax + qmin) / 2.0 @@ -2734,7 +2734,7 @@ void fake_quantize_learnable_channel_grad_kernel_cpu( float grad_factor) { iter.for_each([&](char** data, const int64_t* strides, int64_t n) { /* To see how the input and outputs are 
referenced and assigned, - please see the implemenetation of + please see the implementation of fake_quantize_learnable_tensor_grad_kernel_cpu. */ for (const auto i : c10::irange(n)) { @@ -2797,8 +2797,8 @@ void quantized_normalize_kernel( "Unexpected size of beta"); scalar_t* X_data = X.data_ptr(); - const float* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const float* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + const float* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const float* beta_data = beta.defined() ? beta.const_data_ptr() : nullptr; scalar_t* Y_data = Y->data_ptr(); const bool gamma_null = gamma_data == nullptr; const bool beta_null = beta_data == nullptr; @@ -3085,8 +3085,8 @@ void quantized_groupnorm_nhwc_kernel( "Unexpected size of beta"); scalar_t* X_data = X.data_ptr(); - const float* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const float* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + const float* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const float* beta_data = beta.defined() ? beta.const_data_ptr() : nullptr; scalar_t* Y_data = Y->data_ptr(); const bool gamma_null = gamma_data == nullptr; const bool beta_null = beta_data == nullptr; @@ -3265,7 +3265,7 @@ void quantized_groupnorm_nhwc_kernel( // // We could fuse step 3 and 4 into a single session but this way is better: // a. D might be too small for vectorization; - // b. Avoid duplicate caculation of scale/bias, each HxW plain share the same scale/bias + // b. Avoid duplicate calculation of scale/bias, each HxW plain share the same scale/bias // for (const auto n : c10::irange(Bs)) { for (const auto g : c10::irange(G)) { @@ -3336,7 +3336,7 @@ void quantize_tensor_per_tensor_affine_cpu( AT_DISPATCH_QINT_TYPES( qtensor.scalar_type(), "quantize_tensor_per_tensor_affine_cpu", [&]() { check_tensor_memory_format(rtensor, qtensor); - const float* rd = rtensor.data_ptr(); + const float* rd = rtensor.const_data_ptr(); auto qd = reinterpret_cast(qtensor.data_ptr()); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) fbgemm::TensorQuantizationParams qparams; @@ -3668,7 +3668,7 @@ void quantize_tensor_per_tensor_affine_cpu( double scale, int64_t zero_point) { check_tensor_memory_format(rtensor, qtensor); - const float* rdata = rtensor.data_ptr(); + const float* rdata = rtensor.const_data_ptr(); int numel = rtensor.numel(); #if defined(__ARM_NEON__) || defined(__aarch64__) AT_DISPATCH_QINT_TYPES( @@ -3707,7 +3707,7 @@ void dequantize_tensor_per_tensor_affine_cpu( #if defined(__ARM_NEON__) || defined(__aarch64__) AT_DISPATCH_QINT_TYPES( qtensor.scalar_type(), "dequantize_tensor_per_tensor_affine_cpu", [&]() { - const scalar_t* qdata = qtensor.data_ptr(); + const scalar_t* qdata = qtensor.const_data_ptr(); auto dequantize_range = [&](int64_t begin, int64_t end) { dequantize_tensor_arm( qdata + begin, rdata + begin, end - begin, scale, zero_point); @@ -3722,7 +3722,7 @@ void dequantize_tensor_per_tensor_affine_cpu( // Fallback path AT_DISPATCH_QINT_TYPES( qtensor.scalar_type(), "dequantize_tensor_per_tensor_affine_cpu", [&]() { - const scalar_t* qdata = qtensor.data_ptr(); + const scalar_t* qdata = qtensor.const_data_ptr(); for (const auto i : c10::irange(numel)) { rdata[i] = dequantize_val(scale, zero_point, qdata[i]); } @@ -3743,7 +3743,7 @@ void quantize_tensor_per_channel_impl( // TODO: channels last kernel can be made faster. // For contiguous tensors, e.g. NCHW, arbitrary axis can be used. 
// For channels_last/3d however axis == 0 or 1. - // Since current implemntation on channels_last format does not + // Since current implementation on channels_last format does not // cover per channel quant with arbitrary axis value, it is better // to check and fail. int64_t batches = size_to_dim_(axis, rtensor.sizes()); @@ -3752,7 +3752,7 @@ void quantize_tensor_per_channel_impl( int64_t channels = rtensor.size(axis); auto scales_data = scales.data_ptr(); auto zero_points_data = zero_points.data_ptr(); - const float* in = rtensor.data_ptr(); + const float* in = rtensor.const_data_ptr(); auto out = qtensor.data_ptr(); if (axis == 1 && (rtensor.is_contiguous(MemoryFormat::ChannelsLast) || @@ -3804,7 +3804,7 @@ void quantize_tensor_per_channel_impl( int64_t channels = rtensor.size(axis); auto scales_data = scales.data_ptr(); auto zero_points_data = zero_points.data_ptr(); - const float* in = rtensor.data_ptr(); + const float* in = rtensor.const_data_ptr(); auto out = (uint8_t*)qtensor.data_ptr(); #if defined(__ARM_NEON__) // magic float and magic int to take care of rounding @@ -4008,7 +4008,7 @@ void dequantize_per_channel_affine_kernel( // For contiguous tensors, e.g. NCHW, arbitrary axis can be used. // For channels_last/3d however axis == 0 or 1. - // Since current implemntation on channels_last format does not + // Since current implementation on channels_last format does not // cover per channel quant with arbitrary axis value, it is better // to check and fail. TORCH_CHECK(rtensor.is_contiguous() || (axis <=1), @@ -4022,7 +4022,7 @@ void dequantize_per_channel_affine_kernel( auto scales_data = scales.data_ptr(); auto zero_points_data = zero_points.data_ptr(); check_tensor_memory_format(qtensor, rtensor); - const auto* qd = qtensor.data_ptr(); + const auto* qd = qtensor.const_data_ptr(); float* rd = rtensor.data_ptr(); const auto elem_per_byte = 8 / bit_width; if (axis == 1 && (rtensor.is_contiguous(MemoryFormat::ChannelsLast) || @@ -4084,7 +4084,7 @@ void quantize_tensor_per_channel_float_qparams_cpu( int64_t axis) { // For contiguous tensors, e.g. NCHW, arbitrary axis can be used. // For channels_last/3d however axis == 0 or 1. - // Since current implemntation on channels_last format does not + // Since current implementation on channels_last format does not // cover per channel quant with arbitrary axis value, it is better // to check and fail. 
TORCH_CHECK(rtensor.is_contiguous() || (axis <=1), @@ -4099,7 +4099,7 @@ void quantize_tensor_per_channel_float_qparams_cpu( auto scales_data = scales.data_ptr(); auto zero_points_data = zero_points.data_ptr(); check_tensor_memory_format(rtensor, qtensor); - const float* rdata = rtensor.data_ptr(); + const float* rdata = rtensor.const_data_ptr(); auto qdata = reinterpret_cast(qtensor.data_ptr()); const auto elem_per_byte = CHAR_BIT / bit_width; int qvalue = 0; @@ -4163,7 +4163,7 @@ void quantize_tensor_per_tensor_affine_sub_byte_cpu( AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES( qtensor.scalar_type(), "quantize_tensor_per_tensor_affine_sub_byte_cpu", [&]() { check_tensor_memory_format(rtensor, qtensor); - const float* const rdata = rtensor.data_ptr(); + const float* const rdata = rtensor.const_data_ptr(); auto qdata = reinterpret_cast(qtensor.data_ptr()); auto numel = rtensor.numel(); const auto elem_per_byte = CHAR_BIT / bit_width; @@ -4196,7 +4196,7 @@ void dequantize_tensor_per_tensor_affine_sub_byte_cpu( qtensor.scalar_type(), "dequantize_tensor_per_tensor_affine_sub_byte_cpu", [&]() { check_tensor_memory_format(rtensor, qtensor); auto rdata = rtensor.data_ptr(); - const underlying_t* qdata = reinterpret_cast(qtensor.data_ptr()); + const underlying_t* qdata = reinterpret_cast(qtensor.const_data_ptr()); auto numel = rtensor.numel(); const auto elem_per_byte = CHAR_BIT / bit_width; diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index 9f3c790d52c75..f915c014af143 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -647,7 +647,7 @@ at::Tensor PackedConvWeightsQnnp::apply_impl_xnnp( // create an empty tensor for packing the weights const at::Tensor weight_contig = orig_weight.contiguous(c10::MemoryFormat::ChannelsLast); - const float* w_scales_data = w_scales.data_ptr(); + const float* w_scales_data = w_scales.const_data_ptr(); underlying_t w_zp = 0; at::Tensor weight_tensor; @@ -1397,7 +1397,7 @@ static at::Tensor _quantized_convolution_onednn( torch::List dilation, bool transposed, int64_t groups, - double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant + double output_scale, int64_t output_zero_point, c10::optional accum, // accum to fused with conv add double accum_scale, @@ -1420,10 +1420,10 @@ static at::Tensor _quantized_convolution_onednn( bool bfloat16_output = output_dtype.has_value() && (output_dtype.value() == c10::kBFloat16); if (fp32_output || bfloat16_output) { // When fp32 or bf16 output, oneDNN expects op_attr doesn't set_scales and set_zero_points. - // So, we will use default inv_output_scale as 1.0 and output_zero_point as 0, since - // when inv_output_scale is 1.0, we will skip invoking of op_attr.set_scales in ideep; + // So, we will use default output_scale as 1.0 and output_zero_point as 0, since + // when output_scale is 1.0, we will skip invoking of op_attr.set_scales in ideep; // when output_zero_point is 0, we will skip invoking of op_attr.set_zero_points in ideep. 
- TORCH_CHECK(inv_output_scale == 1.0, " (ONEDNN): fp32 or bf16 output, inv_output_scale must be 1.0."); + TORCH_CHECK(output_scale == 1.0, " (ONEDNN): fp32 or bf16 output, output_scale must be 1.0."); TORCH_CHECK(output_zero_point == 0, " (ONEDNN): fp32 or bf16 output, output_zero_point must be 0"); } @@ -1504,9 +1504,6 @@ static at::Tensor _quantized_convolution_onednn( kSpatialDim, "D convolution."); // Parameters - // Scales of ONEDNN and PyTorch are reciprocal - const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0 / act_scale); - #if IDEEP_PREREQ(3, 1, 0, 1) // 1. If the weight scale generated by observer should with dtype float32 // https://github.com/pytorch/pytorch/blob/d2c24eca8a60c56b31ca967a44d5cc4522802aa6/torch/ao/quantization/observer.py#L323 @@ -1592,75 +1589,117 @@ static at::Tensor _quantized_convolution_onednn( output_sizes = at::native::conv_output_size(input_size, kernel_size, padding.vec(), stride.vec(), dilation.vec()); ideep::dims dst_dims = ideep::dims({output_sizes.cbegin(), output_sizes.cend()}); // Output is not a quantized tensor but data type is uint8 - at::Tensor output; - if (fp32_output || bfloat16_output) { - output = at::empty( + at::Tensor output = has_accum_postop_sum ? + accum.value() : + at::empty( dst_dims, device(c10::kCPU) - .dtype(fp32_output ? c10::kFloat : c10::kBFloat16) - .memory_format(kSpatialDim == 2 ? - c10::MemoryFormat::ChannelsLast : - c10::MemoryFormat::ChannelsLast3d), - c10::nullopt); - } else { - output = at::empty( - dst_dims, - device(c10::kCPU) - .dtype(c10::kByte) + .dtype(fp32_output ? c10::kFloat : (bfloat16_output ? c10::kBFloat16 : c10::kByte)) .memory_format(kSpatialDim == 2 ? c10::MemoryFormat::ChannelsLast : c10::MemoryFormat::ChannelsLast3d) ); - } if (output.numel() == 0) { return output; } - ideep::tensor dst; - if (has_accum_postop_sum) { - auto dst_desc = ideep::tensor::desc(dst_dims, fp32_output ? ideep::tensor::data_type::f32 : ( - bfloat16_output ? ideep::tensor::data_type::bf16 : src_data_type), - kSpatialDim == 2 ? ideep::format_tag::nhwc : ideep::format_tag::ndhwc); - TORCH_CHECK(accum.value().dtype() == output.dtype(), "The output tensor should have same dtype as the accum tensor."); - // When fused with sum, the dst tensor will share the data ptr as the accum tensor. - dst.init(dst_desc, accum.value().data_ptr()); - } else { - if (fp32_output || bfloat16_output) { - // Conv without add: int8-in, fp32-output - dst = ideep::tensor({dst_dims, fp32_output ? ideep::tensor::data_type::f32 : ideep::tensor::data_type::bf16, {output.strides().cbegin(), output.strides().cend()}}, - output.data_ptr()); - } else { - dst = ideep::tensor({dst_dims, ideep::tensor::data_type::u8, {output.strides().cbegin(), output.strides().cend()}}, - output.data_ptr()); - } + ideep::tensor dst = at::native::itensor_view_from_dense(output); + static ideep::tensor::desc dummy_accum_desc; + ideep::attr_t op_attr = onednn_utils::create_attr_by_post_op( + binary_attr.has_value() ? binary_attr.value() : "none", + binary_alpha.has_value() ? binary_alpha.value().to() : 1.0, + accum_scale, + accum_zero_point, + dummy_accum_desc, + unary_attr.has_value() ? unary_attr.value() : "none", + unary_scalars, + unary_algorithm.has_value() ? unary_algorithm.value() : "" + ); + +#if IDEEP_PREREQ(3, 1, 0, 0) + // Use oneDNN's APIs instead of prepare/compute from ideep to reduce integration overhead. + // The functions from ideep are heavy because they have complex data structures for unified API + // oneDNN version >= 3.1.0 is required. 
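For reference on the block that follows: with raw oneDNN 3.x primitives, quantization scales and zero points are declared on the attr via set_scales_mask / set_zero_points_mask and then supplied at execution time as small f32 / s32 memories under DNNL_ARG_ATTR_SCALES / DNNL_ARG_ATTR_ZERO_POINTS, instead of being folded into the ideep wrappers. A minimal standalone sketch of that mechanism, deliberately using a reorder primitive rather than convolution to keep it short:

#include <cstdio>
#include <vector>
#include <dnnl.hpp>

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);
  stream strm(eng);

  // s8 -> f32 reorder whose source scale is supplied at run time.
  memory::desc src_md({8}, memory::data_type::s8, memory::format_tag::x);
  memory::desc dst_md({8}, memory::data_type::f32, memory::format_tag::x);
  memory::desc scale_md({1}, memory::data_type::f32, memory::format_tag::x);

  std::vector<int8_t> src_data{-4, -3, -2, -1, 0, 1, 2, 3};
  std::vector<float> dst_data(8);
  std::vector<float> scale{0.5f};

  memory src_m(src_md, eng, src_data.data());
  memory dst_m(dst_md, eng, dst_data.data());
  memory scale_m(scale_md, eng, scale.data());

  primitive_attr attr;
  attr.set_scales_mask(DNNL_ARG_SRC, /*mask=*/0);  // one common scale for the whole tensor

  reorder::primitive_desc rpd(eng, src_md, eng, dst_md, attr);
  reorder(rpd).execute(strm, {{DNNL_ARG_SRC, src_m},
                              {DNNL_ARG_DST, dst_m},
                              {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, scale_m}});
  strm.wait();
  std::printf("dst[7] = %f\n", dst_data[7]);  // expect 3 * 0.5 = 1.5
  return 0;
}

The convolution path added below does the same thing with convolution_forward::primitive_desc, with the weight scales mask derived from the per-channel weight layout.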
+ using ideep::tensor; + auto weights_desc = packed_weight.get_desc(); + auto dst_desc = dst.get_desc(); + auto bias_desc = with_bias ? + tensor::desc(expected_bias.get_dims(), ideep::data_type::f32, ideep::format_tag::any) : + tensor::desc(); + if (act_scale != 1.0f) { + op_attr.set_scales_mask(DNNL_ARG_SRC, 0); } - ideep::attr_t op_attr; - // attr + if (act_zero_point != 0) { + op_attr.set_zero_points_mask(DNNL_ARG_SRC, 0); + } + int oc_per_group = packed_weight.get_dim(0) / groups; + int wei_scale_mask = ideep::utils::conv_weight_scale_mask(weight_scales.numel(), oc_per_group, groups, false); + op_attr.set_scales_mask(DNNL_ARG_WEIGHTS, wei_scale_mask); + if (output_scale != 1.0f) { + op_attr.set_scales_mask(DNNL_ARG_DST, 0); + } + if (output_zero_point != 0) { + op_attr.set_zero_points_mask(DNNL_ARG_DST, 0); + } + op_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + auto engine = ideep::engine::cpu_engine(); + auto dilates_dnnl = ideep::utils::get_compatible_dilates(dilation.vec()); + auto primitive_desc = with_bias ? + dnnl::convolution_forward::primitive_desc( + engine, dnnl::prop_kind::forward_inference, dnnl::algorithm::convolution_direct, + src_desc, weights_desc, bias_desc, dst_desc, + stride.vec(), dilates_dnnl, padding.vec(), padding.vec(), op_attr + ) : + dnnl::convolution_forward::primitive_desc( + engine, dnnl::prop_kind::forward_inference, dnnl::algorithm::convolution_direct, + src_desc, weights_desc, dst_desc, + stride.vec(), dilates_dnnl, padding.vec(), padding.vec(), op_attr + ); + auto primitive = dnnl::convolution_forward(primitive_desc); + + // Reorder weight if needed + auto expected_weight = packed_weight.reorder_if_differ_in(primitive_desc.weights_desc()); + + // Prepare args and execute primitive + tensor scratchpad(primitive_desc.scratchpad_desc()); + ideep::exec_args args; + args.insert({DNNL_ARG_SRC, src}); + args.insert({DNNL_ARG_WEIGHTS, expected_weight}); + args.insert({DNNL_ARG_DST, dst}); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad}); + if (with_bias) { + args.insert({DNNL_ARG_BIAS, expected_bias}); + } + tensor src_scales_t = tensor(ideep::scale_t(1, act_scale)); + tensor wei_scales_t = tensor(weights_scales); + tensor dst_scales_t = tensor(ideep::scale_t(1, output_scale)); + tensor src_zp_t = tensor(ideep::zero_point_t(1, act_zero_point)); + tensor dst_zp_t = tensor(ideep::zero_point_t(1, output_zero_point)); + if (act_scale != 1.0f) { + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales_t}); + } + if (output_scale != 1.0f) { + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scales_t}); + } + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_scales_t}); + if (act_zero_point != 0) { + args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, src_zp_t}); + } + if (output_zero_point != 0) { + args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zp_t}); + } + primitive.execute(ideep::stream::default_stream(), args); +#else + // Scales of ONEDNN and PyTorch are reciprocal + const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0 / act_scale); + + // set accum scale/zero point to dst if (has_accum_postop_sum) { - op_attr = (has_unary_post_op && unary_attr.value()=="relu") ? ideep::attr_t::residual_with_sum_zero_point() : ideep::attr_t::fuse_sum(); const ideep::scale_t accum_ideep_scale = ideep::scale_t(1, 1.0/accum_scale); const ideep::zero_point_t accum_ideep_zero_points = ideep::zero_point_t(1, accum_zero_point); // Set the dst scale and zero point with the value of accum. 
- // The true scale and zero point is stored in ideep::scale_t(scale_size, inv_output_scale) and dst_zero_points. + // The true scale and zero point are stored in ideep::scale_t(scale_size, output_scale) and dst_zero_points. dst.set_scale(accum_ideep_scale); dst.set_zero_point(accum_ideep_zero_points); - } else { - if (has_unary_post_op && unary_attr.value()=="relu") { - op_attr = ideep::attr_t::fuse_relu(); - } else if (has_unary_post_op && unary_attr.value()=="hardtanh") { - TORCH_CHECK( - unary_scalars.size() == 2 && - unary_scalars[0].get().toOptional().has_value() && - unary_scalars[1].get().toOptional().has_value(), - "hardtanh is expected to have two scalar input: min_val and max_val"); - - auto lower_bound_value = - unary_scalars[0].get().toOptional().value().to(); - auto upper_bound_value = - unary_scalars[1].get().toOptional().value().to(); - op_attr = ideep::attr_t::fuse_clamp(lower_bound_value, upper_bound_value); - } else { - op_attr = ideep::attr_t(); - } } // Weight Reorder @@ -1668,7 +1707,7 @@ static at::Tensor _quantized_convolution_onednn( ideep::convolution_forward::prepare( params, src, packed_weight, expected_bias, dst_dims, dst, stride.vec(), dilation.vec(), padding.vec(), padding.vec(), groups, - src_scales, weights_scales, ideep::scale_t(1, inv_output_scale), + src_scales, weights_scales, ideep::scale_t(1, 1.0f / output_scale), src_zero_points, dst_zero_points, op_attr, dnnl::algorithm::convolution_direct, dnnl::prop_kind::forward_inference, @@ -1678,6 +1717,7 @@ static at::Tensor _quantized_convolution_onednn( // Computation ideep::convolution_forward::compute(params, src, expected_weight, expected_bias, dst); +#endif if (is_1d) { output.squeeze_(quant_utils::kConv1dSqueezeDim + 2); @@ -1832,7 +1872,7 @@ class QConvoneDNN final { torch::List padding, torch::List dilation, int64_t groups, - double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant + double output_scale, int64_t output_zero_point, c10::optional output_dtype, c10::string_view attr, @@ -1851,8 +1891,8 @@ class QConvoneDNN final { } else { // Conv2D post op check TORCH_CHECK( - attr == "none" || attr == "relu" || attr == "hardtanh", - "none post_op or post_op relu/hardtanh is supported for quantized pointwise conv2d. Got unary_post_op: ", + attr == "none" || attr == "relu" || attr == "hardtanh" || attr == "hardswish" || attr == "swish", + "none post_op or post_op relu/hardtanh/hardswish/swish is supported for quantized pointwise conv2d.
Got unary_post_op: ", attr, ".") } @@ -1860,7 +1900,7 @@ class QConvoneDNN final { act, act_scale, act_zero_point, weight, weight_scales, weight_zero_points, bias, stride, padding, dilation, /*transposed*/false, - groups, inv_output_scale, output_zero_point, + groups, output_scale, output_zero_point, /*accum*/c10::nullopt, /*accum_scale*/0.0, /*accum_zero_point*/0, /*output_dtype*/output_dtype, /*binary_attr*/c10::nullopt, /*binary_alpha*/c10::nullopt, /*unary_attr*/attr, /*unary_scalars*/scalars, /*unary_algorithm*/algorithm @@ -1884,7 +1924,7 @@ class QConvoneDNN final { torch::List padding, torch::List dilation, int64_t groups, - double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant + double output_scale, int64_t output_zero_point, c10::optional output_dtype, c10::string_view binary_attr, @@ -1912,7 +1952,7 @@ class QConvoneDNN final { act, act_scale, act_zero_point, weight, weight_scales, weight_zero_points, bias, stride, padding, dilation, /*transposed*/false, - groups, inv_output_scale, output_zero_point, + groups, output_scale, output_zero_point, accum, accum_scale, accum_zero_point, /*output_dtype*/output_dtype, binary_attr, alpha, unary_attr, unary_scalars, unary_algorithm diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index 6f996691c0946..46172f0c199f4 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -283,9 +283,7 @@ c10::intrusive_ptr> PackedConvWeightsQnnp< auto kernel_dim = kSpatialDim == 2 ? std::vector{kernel_h, kernel_w} : std::vector{kernel_d, kernel_h, kernel_w}; - std::vector w_zero_points; - at::Tensor w_scales; - std::tie(w_zero_points, w_scales) = + auto [w_zero_points, w_scales] = make_zero_points_and_scales_tensor(weight_contig, transpose, groups); // We set the pre-packed conv weights to nullptr below as we call pre-pack // during the first invocation of operator run. Refer to qconv.cpp for more diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index 0057fea54c2e0..7e5083057a0ba 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -491,7 +491,7 @@ at::Tensor& embedding_bag_byte_impl( /*offsets_or_lengths=*/offsets_data + start_idx, /*weights=*/ per_sample_weights_ - ? per_sample_weights_.value().data_ptr() + + ? 
per_sample_weights_.value().const_data_ptr() + offsets_data[start_idx] : nullptr, /*out=*/output_data + start_idx * D); diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp index 763ac7c784c83..9cfbce72e31d1 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp @@ -436,8 +436,7 @@ Tensor _qembeddingbag_nbit_prepack_helper( // NOLINTNEXTLINE(cppcoreguidelines-init-variables) float Xmin, Xmax; if (optimized_qparams) { - at::Tensor xmax_tensor, xmin_tensor; - std::tie(xmax_tensor, xmin_tensor) = at::choose_qparams_optimized( + auto [xmax_tensor, xmin_tensor] = at::choose_qparams_optimized( float_weight[row], embedding_cols, nbins, ratio, bit_width); TORCH_CHECK( xmax_tensor.numel() == 1 && xmin_tensor.numel() == 1, diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp index 3612f8eba2f88..7c1093a1c4c1a 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp @@ -37,7 +37,7 @@ at::Tensor PackedEmbeddingBagWeight::unpack() { scale_bias_bytes = 4; } - const auto* input = packed_weight.data_ptr(); + const auto* input = packed_weight.const_data_ptr(); // Calculate the output shape, accounting for the last n bytes to be used // for scale/bias rest of the entries are packed depending on the bit_width. std::vector output_shape = { @@ -125,7 +125,7 @@ Tensor& qembeddingbag_byte_unpack_out(Tensor& output, const Tensor& packed_weigh // The last 2 values are used to store the FP32 scale and zero_point values // per row. const int32_t output_columns = input_columns - 2 * sizeof(float); - const auto* input_data = packed_weight.data_ptr(); + const auto* input_data = packed_weight.const_data_ptr(); std::vector output_shape = packed_weight_sizes.vec(); output_shape[col_dim] = output_columns; @@ -187,7 +187,7 @@ Tensor _qembeddingbag_nbit_unpack_helper( int BIT_RATE) { const auto input_rows = packed_weight.size(0); const auto input_columns = packed_weight.size(1); - const auto* input_data = packed_weight.data_ptr(); + const auto* input_data = packed_weight.const_data_ptr(); int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; // The last 4 bytes per row are two fp16 scale and zero_point. diff --git a/aten/src/ATen/native/quantized/cpu/qgelu.cpp b/aten/src/ATen/native/quantized/cpu/qgelu.cpp index f9a3c32343df7..743832431e0c4 100644 --- a/aten/src/ATen/native/quantized/cpu/qgelu.cpp +++ b/aten/src/ATen/native/quantized/cpu/qgelu.cpp @@ -18,4 +18,12 @@ Tensor gelu_quantized_cpu(const Tensor& qx, c10::string_view approximate) { qgelu_stub(qx.device().type(), qx, qy, get_gelutype_enum(approximate)); return qy; } + +Tensor& gelu_quantized_cpu_(Tensor& self, c10::string_view approximate) { + Tensor qy = gelu_quantized_cpu(self, approximate); + // This can be optimized in a future PR if it becomes a bottleneck. 
+ self.copy_(qy); + return self; +} + }} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index 78e4551119ddb..df6df3c35201d 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -123,6 +123,8 @@ at::Tensor& PackedLinearWeight::apply_impl( // Allocate a buffer for fbgemmPacked to use auto buffer = at::empty(out_sizes, output.options().dtype(at::kInt)); + auto output_data = reinterpret_cast(output.data_ptr()); + int num_tasks = at::get_num_threads(); at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) { for (const auto task_id : c10::irange(begin, end)) { @@ -184,7 +186,7 @@ at::Tensor& PackedLinearWeight::apply_impl( fbgemm::fbgemmPacked( /*packA=*/packA, /*packB=*/*packB, - /*C=*/reinterpret_cast(output.data_ptr()), + /*C=*/output_data, /*C_buffer=*/buffer.data_ptr(), /*ldc=*/N, /*outProcess=*/outputProcObj, @@ -220,7 +222,7 @@ at::Tensor& PackedLinearWeight::apply_impl( fbgemm::fbgemmPacked( /*packA=*/packA, /*packB=*/*packB, - /*C=*/reinterpret_cast(output.data_ptr()), + /*C=*/output_data, /*C_buffer=*/buffer.data_ptr(), /*ldc=*/N, /*outProcess=*/outputProcObj, @@ -314,7 +316,7 @@ at::Tensor PackedLinearWeight::apply_with_input_q_dq_qweight_dq_output_fp32_impl fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); auto input_contig = input.expect_contiguous(); - const auto* input_ptr = input_contig->data_ptr(); + const auto* input_ptr = input_contig->const_data_ptr(); TORCH_CHECK( input.dim() >= 2, @@ -358,6 +360,8 @@ at::Tensor PackedLinearWeight::apply_with_input_q_dq_qweight_dq_output_fp32_impl output.options().dtype(at::kInt), LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto output_data = output.data_ptr(); + int num_tasks = at::get_num_threads(); at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) { fbgemm::PackAWithQuantRowOffset packA( @@ -396,7 +400,7 @@ at::Tensor PackedLinearWeight::apply_with_input_q_dq_qweight_dq_output_fp32_impl fbgemm::fbgemmPacked( /*packA=*/packA, /*packB=*/*packB, - /*C=*/output.data_ptr(), + /*C=*/output_data, /*C_buffer=*/buffer.data_ptr(), /*ldc=*/N, /*outProcess=*/outputProcObj, @@ -428,7 +432,7 @@ at::Tensor PackedLinearWeight::apply_with_input_q_dq_qweight_dq_output_fp32_impl fbgemm::fbgemmPacked( /*packA=*/packA, /*packB=*/*packB, - /*C=*/output.data_ptr(), + /*C=*/output_data, /*C_buffer=*/buffer.data_ptr(), /*ldc=*/N, /*outProcess=*/outputProcObj, @@ -481,7 +485,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp( xnn_operator_t xnnp_op = nullptr; - const float* weight_scales_data = w_scales.data_ptr(); + const float* weight_scales_data = w_scales.const_data_ptr(); // prepare weights underlying_t w_zp = static_cast( @@ -586,7 +590,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp( status, ")"); - // Run the opeator + // Run the operator status = xnn_run_operator( xnnp_linear_op.get(), // Linear op caffe2::pthreadpool_() // threadpool @@ -917,24 +921,61 @@ static at::Tensor linear_int8_with_onednn_weight( double output_scale, int64_t output_zero_point, c10::optional output_dtype, - std::string& post_op_name, // e.g. "none", "relu" - torch::List>& post_op_args, - std::string& post_op_algorithm) { + c10::optional other, // extra input for binary post-op + double other_scale, + int64_t other_zero_point, + const c10::string_view& binary_post_op, // e.g. "none", "sum", "add" + double binary_alpha, + const c10::string_view& unary_post_op, // e.g. 
"none", "relu" + torch::List>& unary_post_op_args, + c10::string_view& unary_post_op_algorithm) { using ideep::tensor; const int64_t dim = input.dim(); - output_scale = 1.0f / output_scale; TORCH_CHECK(input.scalar_type() == c10::ScalarType::Byte, "qlinear with mkldnn tensor: data type of input should be uint8 (unsigned char)."); TORCH_CHECK(onednn_weight.scalar_type() == c10::ScalarType::Char, "qlinear with mkldnn tensor: data type of weight should be int8 (char)."); TORCH_CHECK( weight_scales.scalar_type() == c10::ScalarType::Float, "weight scales should be dtype c10::ScalarType::Float."); + TORCH_CHECK( + binary_alpha == 1.0f, "onednn qlinear: alpha != 1 for binary post op is not yet supported."); bool fp32_output = output_dtype.has_value() && (output_dtype.value() == c10::kFloat); - bool bfloat16_output = output_dtype.has_value() && (output_dtype.value() == c10::kBFloat16); - if (fp32_output || bfloat16_output) { + bool bf16_output = output_dtype.has_value() && (output_dtype.value() == c10::kBFloat16); + if (fp32_output || bf16_output) { TORCH_CHECK( output_scale == 1.0f && output_zero_point == 0, "onednn qlinear: expect scale=1 and zero point=0 for fp32 output"); } + if (binary_post_op != "none") { + /* Supported cases for binary post op: + +-------------------+--------------+---------------+ + | Extra input dtype | Output dtype | Post op | + +-------------------+--------------+---------------+ + | Fp32/bf16 | fp32/bf16 | sum | + +-------------------+--------------+---------------+ + | Fp32/bf16 | int8 | add | + +-------------------+--------------+---------------+ + | int8 | fp32/bf16 | not supported | + +-------------------+--------------+---------------+ + | int8 | int8 | sum | + +-------------------+--------------+---------------+ + */ + TORCH_CHECK(other.has_value(), "onednn qlinear: the extra input is missing for post op ", binary_post_op); + if (fp32_output || bf16_output) { + TORCH_CHECK( + other_scale == 1.0f && other_zero_point == 0, + "onednn qlinear: expect extra input scale = 1.0 and zero point = 0 when output dtype is ", output_dtype.value(), + ", but got ", other_scale, " and ", other_zero_point, ", respectively" + ); + } + if (binary_post_op == "sum") { + auto expected_dtype = output_dtype.has_value() ? output_dtype.value() : c10::kByte; + TORCH_CHECK( + other.value().scalar_type() == expected_dtype, + "onednn qlinear: the dtype of extra input for binary post op should be ", expected_dtype, + " (same as output dtype), but got ", other.value().scalar_type() + ); + } + } // If the input has more than two dimensions, we will reshape it to a 2-dimensional form // for calculation and subsequently reshape the output back. @@ -962,35 +1003,45 @@ static at::Tensor linear_int8_with_onednn_weight( } std::vector src_dims = {M, K}; std::vector dst_dims = {M, N}; - at::Tensor output = at::empty( - dst_dims, - device(c10::kCPU) - .dtype(fp32_output ? c10::kFloat : (bfloat16_output ? c10::kBFloat16 : c10::kByte)) - ); + at::Tensor output = binary_post_op == "sum" ? + other.value() : + at::empty( + dst_dims, + device(c10::kCPU) + .dtype(fp32_output ? c10::kFloat : (bf16_output ? c10::kBFloat16 : c10::kByte)) + ); if (output.numel() == 0) { return output; } tensor dst = at::native::itensor_view_from_dense(output); + static tensor empty_tensor; + static tensor::desc empty_tensor_desc; + tensor src1 = binary_post_op == "add" ? 
+ at::native::itensor_view_from_dense(other.value().reshape({-1, other.value().size(dim - 1)})) : + empty_tensor; // Create onednn primitive auto src_desc = tensor::desc(src_dims, ideep::data_type::u8, ideep::format_tag::any); auto weights_desc = packed_weight.get_desc(); - auto dst_dtype = fp32_output ? ideep::data_type::f32 : (bfloat16_output ? ideep::tensor::data_type::bf16 : ideep::data_type::u8); + auto dst_dtype = dst.get_data_type(); auto dst_desc = tensor::desc(dst_dims, dst_dtype, ideep::format_tag::any); auto bias_desc = with_bias ? tensor::desc(onednn_bias.value().get_dims(), ideep::data_type::f32, ideep::format_tag::any) : - tensor::desc(); - dnnl::algorithm post_op_algo = dnnl::algorithm::undef; - if (post_op_name == "gelu") { - if (post_op_algorithm == "none") { - post_op_algo = dnnl::algorithm::eltwise_gelu_erf; - } else if (post_op_algorithm == "tanh") { - post_op_algo = dnnl::algorithm::eltwise_gelu_tanh; - } else { - TORCH_CHECK(false, "un-supported GELU approximate, none or tanh is supported."); - } - } - auto op_attr = onednn_utils::create_attr_by_post_op(post_op_name, post_op_args, post_op_algo); + empty_tensor_desc; + // Get op attr for primitive + // Note: output_scale & output_zero_point are for re-quantization of the final output. + // And other_scale & other_zero_point are for dequantization of other. + auto other_desc = binary_post_op == "add" ? src1.get_desc() : empty_tensor_desc; + auto op_attr = onednn_utils::create_attr_by_post_op( + binary_post_op, + binary_alpha, + other_scale, + other_zero_point, + other_desc, + unary_post_op, + unary_post_op_args, + unary_post_op_algorithm + ); if (input_scale != 1.0f) { op_attr.set_scales_mask(DNNL_ARG_SRC, 0); } @@ -1042,6 +1093,9 @@ static at::Tensor linear_int8_with_onednn_weight( if (output_zero_point != 0) { args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zp_t}); } + if (binary_post_op == "add") { + args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, src1}); + } primitive.execute(ideep::stream::default_stream(), args); return dim == 2 ? 
output : output.reshape(output_size); } @@ -1144,21 +1198,121 @@ class QLinearOnednn final { double output_scale, int64_t output_zero_point, c10::optional output_dtype, - std::string post_op_name, + c10::string_view post_op_name, torch::List> post_op_args, - std::string post_op_algorithm) { + c10::string_view post_op_algorithm) { #if AT_MKLDNN_ENABLED() + static c10::optional other = c10::nullopt; + static const c10::string_view binary_post_op = "none"; return linear_int8_with_onednn_weight( act, act_scale, act_zero_point, onednn_weight, weight_scales, weight_zero_points, bias, output_scale, output_zero_point, output_dtype, + other, /*other scale*/1.0, /*other zp*/0, + binary_post_op, /*binary alpha*/1.0, post_op_name, post_op_args, post_op_algorithm ); #endif TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); } -}; + static Tensor run_pointwise_tensor( + Tensor act, // int8 CPU tensor, not QTensor + Tensor act_scale, + Tensor act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + c10::optional bias, + double output_scale, + int64_t output_zero_point, + c10::optional output_dtype, + c10::string_view post_op_name, + torch::List> post_op_args, + c10::string_view post_op_algorithm) { +#if AT_MKLDNN_ENABLED() + TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1, + "onednn int8 linear: act scale/zp size should be 1"); + static c10::optional other = c10::nullopt; + static const c10::string_view binary_post_op = "none"; + return linear_int8_with_onednn_weight( + act, act_scale.item().toDouble(), act_zero_point.item().toLong(), + onednn_weight, weight_scales, weight_zero_points, + bias, output_scale, output_zero_point, output_dtype, + other, /*other scale*/1.0, /*other zp*/0, + binary_post_op, /*binary alpha*/1.0, + post_op_name, post_op_args, post_op_algorithm + ); +#endif + TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); + } + + static Tensor run_pointwise_binary( + Tensor act, // int8 CPU tensor, not QTensor + double act_scale, + int64_t act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + c10::optional bias, + double output_scale, + int64_t output_zero_point, + c10::optional output_dtype, + c10::optional other, // extra input for binary post-op + double other_scale, + int64_t other_zero_point, + c10::string_view binary_post_op, // e.g. "none", "sum", "add" + double binary_alpha, + c10::string_view unary_post_op, // e.g. 
"none", "relu" + torch::List> unary_post_op_args, + c10::string_view unary_post_op_algorithm) { +#if AT_MKLDNN_ENABLED() + return linear_int8_with_onednn_weight( + act, act_scale, act_zero_point, + onednn_weight, weight_scales, weight_zero_points, + bias, output_scale, output_zero_point, output_dtype, + other, other_scale, other_zero_point, + binary_post_op, binary_alpha, + unary_post_op, unary_post_op_args, unary_post_op_algorithm + ); +#endif + TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); + } + + static Tensor run_pointwise_binary_tensor( + Tensor act, // int8 CPU tensor, not QTensor + Tensor act_scale, + Tensor act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + c10::optional bias, + double output_scale, + int64_t output_zero_point, + c10::optional output_dtype, + c10::optional other, // extra input for binary post-op + double other_scale, + int64_t other_zero_point, + c10::string_view binary_post_op, // e.g. "none", "sum", "add" + double binary_alpha, + c10::string_view unary_post_op, // e.g. "none", "relu" + torch::List> unary_post_op_args, + c10::string_view unary_post_op_algorithm) { +#if AT_MKLDNN_ENABLED() + TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1, + "onednn int8 linear: act scale/zp size should be 1"); + return linear_int8_with_onednn_weight( + act, act_scale.item().toDouble(), act_zero_point.item().toLong(), + onednn_weight, weight_scales, weight_zero_points, + bias, output_scale, output_zero_point, output_dtype, + other, other_scale, other_zero_point, + binary_post_op, binary_alpha, + unary_post_op, unary_post_op_args, unary_post_op_algorithm + ); +#endif + TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); + } +}; TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { register_linear_params(); @@ -1181,6 +1335,12 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) { TORCH_LIBRARY_IMPL(onednn, MkldnnCPU, m) { m.impl(TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"), TORCH_FN(QLinearOnednn::run_pointwise)); + m.impl(TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"), + TORCH_FN(QLinearOnednn::run_pointwise_tensor)); + m.impl(TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"), + TORCH_FN(QLinearOnednn::run_pointwise_binary)); + m.impl(TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"), + TORCH_FN(QLinearOnednn::run_pointwise_binary_tensor)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp index f871877073a75..935ad081bd908 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #include #endif @@ -43,7 +46,7 @@ at::Tensor PackedLinearWeight::apply_dynamic_impl( // TODO: contiguous is called for further jit optimizations. 
auto input_contig = input.contiguous(); - const auto* input_ptr = input_contig.data_ptr(); + const auto* input_ptr = input_contig.const_data_ptr(); TORCH_CHECK( input.dim() >= 2, @@ -266,7 +269,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl( TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)"); auto bias_contig = bias_vec.contiguous(); - const float* bias_ptr = bias_contig.data_ptr(); + const float* bias_ptr = bias_contig.const_data_ptr(); // Calculate statistics for quantization of input Tensor // TODO: optimized kernel @@ -407,7 +410,7 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_impl( const at::Tensor& input, at::Tensor& output) { const at::Tensor input_contig = input.contiguous(); - const float* input_ptr = input_contig.data_ptr(); + const float* input_ptr = input_contig.const_data_ptr(); auto& packed_weight_fp16 = *w; @@ -423,6 +426,8 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_impl( // Resize output Tensor output.resize_(output_sizes); + auto output_data = output.data_ptr(); + int num_tasks = at::get_num_threads(); at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) { for (const auto task_id : c10::irange(begin, end)) { @@ -433,7 +438,7 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_impl( /*A=*/input_ptr, /*Bp=*/packed_weight_fp16, /*beta=*/0.0f, - /*C=*/output.data_ptr(), + /*C=*/output_data, /*thread_id=*/static_cast(task_id), /*num_threads=*/num_tasks); } @@ -520,8 +525,7 @@ at::Tensor PackedLinearWeightsOnednn::apply_dynamic_impl( /*len=*/input.numel()); #else if (input_contig.numel() > 0) { - Tensor t_min, t_max; - std::tie(t_min, t_max) = at::aminmax(input_contig); + auto [t_min, t_max] = at::aminmax(input_contig); x_max = t_max.item(); x_min = t_min.item(); } @@ -659,6 +663,122 @@ class QLinearDynamicFp16 final { #endif // USE_FBGEMM }; +class QLinearUnpackedDynamicFp16 final { + public: +#ifdef USE_FBGEMM + static at::Tensor run( + at::Tensor input, + const at::Tensor& weight, + const at::Tensor& bias) { + // We make a strong guarantee that models using these operators will have + // the same numerics across different machines. Therefore, we do not provide + // a fallback path and rather fail loudly if we cannot run FBGEMM. + TORCH_CHECK( + fbgemm::fbgemmSupportedCPU(), "Your CPU doesn't support FBGEMM."); + + TORCH_CHECK( + weight.dim() == 2, + "The dimension of weight tensor should be equal to 2"); + + auto packed_weight = PackedLinearWeightFp16::prepack(weight, bias); + auto output = packed_weight->apply_dynamic(std::move(input)); + + return output; + } + + static at::Tensor meta( + at::Tensor input, + const at::Tensor& weight, + const at::Tensor& bias) { + // We make a strong guarantee that models using these operators will have + // the same numerics across different machines. Therefore, we do not provide + // a fallback path and rather fail loudly if we cannot run FBGEMM. + TORCH_CHECK( + fbgemm::fbgemmSupportedCPU(), "Your CPU doesn't support FBGEMM."); + + TORCH_CHECK( + weight.dim() == 2, + "The dimension of weight tensor should be equal to 2"); + + auto out_channel = weight.sym_sizes().vec()[0]; + auto out_sizes = input.sym_sizes().vec(); + out_sizes[out_sizes.size() - 1] = out_channel; + + return at::empty_symint(out_sizes, input.options()); + } +#else // USE_FBGEMM + static at::Tensor run( + at::Tensor /* input */, + const at::Tensor& weight, + const at::Tensor& bias) { + // We make a strong guarantee that models using these operators will have + // the same numerics across different machines. 
Therefore, we do not provide + // a fallback path and rather fail loudly if we cannot run FBGEMM. + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); + } + + static at::Tensor meta( + at::Tensor /* input */, + const at::Tensor& weight, + const at::Tensor& bias) { + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); + } +#endif // USE_FBGEMM +}; + +at::Tensor wrapped_fbgemm_pack_gemm_matrix_fp16(const at::Tensor& weight) { +#ifdef USE_FBGEMM + TORCH_CHECK( + weight.dim() == 2, + "fbgemm weight packing only packs matrices not vectors."); + return at::native::fbgemm_pack_gemm_matrix_fp16(weight); +#else // USE_FBGEMM + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); +#endif // USE_FBGEMM +} + +at::Tensor wrapped_fbgemm_pack_gemm_matrix_fp16_meta(const at::Tensor& weight) { +#ifdef USE_FBGEMM + // Strictly speaking this is not correct. However we do not know the exact + // size of the packed matrix as it's being maintained by the object itself, + // therefore we return the view we have here. + return at::empty({8}, weight.options().dtype(at::kByte)); +#else // USE_FBGEMM + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); +#endif // USE_FBGEMM +} + +at::Tensor wrapped_fbgemm_linear_fp16_weight(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, int64_t out_channel) { +#ifdef USE_FBGEMM + return at::native::fbgemm_linear_fp16_weight(input, weight, bias); +#else // USE_FBGEMM + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); +#endif // USE_FBGEMM +} + +at::Tensor wrapped_fbgemm_linear_fp16_weight_meta(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, int64_t out_channel) { +#ifdef USE_FBGEMM + // For the meta function, we need users to provide the dimension explicitly + // as we don't have access to the weight. 
+ auto out_sizes = input.sym_sizes().vec(); + if (out_channel == -1) { + out_sizes.pop_back(); + } else { + out_sizes.back() = out_channel; + } + return at::empty_symint(out_sizes, input.options()); +#else // USE_FBGEMM + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); +#endif // USE_FBGEMM +} + + TORCH_LIBRARY_IMPL(quantized, CPU, m) { register_linear_params(); m.impl( @@ -670,16 +790,40 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) { m.impl( TORCH_SELECTIVE_NAME("quantized::linear_dynamic_fp16"), TORCH_FN(QLinearDynamicFp16::run)); + m.impl( + TORCH_SELECTIVE_NAME("quantized::linear_dynamic_fp16_unpacked_weight"), + TORCH_FN(QLinearUnpackedDynamicFp16::run)); m.impl( TORCH_SELECTIVE_NAME("quantized::linear_relu_dynamic_fp16"), TORCH_FN(QLinearDynamicFp16::run)); } +TORCH_LIBRARY_IMPL(quantized, Meta, m) { + m.impl( + TORCH_SELECTIVE_NAME("quantized::linear_dynamic_fp16_unpacked_weight"), + TORCH_FN(QLinearUnpackedDynamicFp16::meta)); +} + TORCH_LIBRARY_IMPL(_quantized, CPU, m) { register_linear_params(); m.impl( TORCH_SELECTIVE_NAME("_quantized::linear_dynamic"), TORCH_FN(QLinearDynamicInt8::run)); + m.impl( + TORCH_SELECTIVE_NAME("_quantized::wrapped_fbgemm_pack_gemm_matrix_fp16"), + wrapped_fbgemm_pack_gemm_matrix_fp16); + m.impl( + TORCH_SELECTIVE_NAME("_quantized::wrapped_fbgemm_linear_fp16_weight"), + wrapped_fbgemm_linear_fp16_weight); +} + +TORCH_LIBRARY_IMPL(_quantized, Meta, m) { + m.impl( + TORCH_SELECTIVE_NAME("_quantized::wrapped_fbgemm_pack_gemm_matrix_fp16"), + wrapped_fbgemm_pack_gemm_matrix_fp16_meta); + m.impl( + TORCH_SELECTIVE_NAME("_quantized::wrapped_fbgemm_linear_fp16_weight"), + wrapped_fbgemm_linear_fp16_weight_meta); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index 50cca56a0284d..a2fb34f90b289 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -157,9 +157,7 @@ c10::intrusive_ptr PackedLinearWeightsQnnp::prepack( " instead"); at::Tensor weight_contig = weight.contiguous(); - std::vector w_zero_points; - at::Tensor w_scales; - std::tie(w_zero_points, w_scales) = + auto [w_zero_points, w_scales] = make_zero_points_and_scales_tensor(weight_contig); at::native::initQNNPACK(); diff --git a/aten/src/ATen/native/quantized/cpu/qmul.cpp b/aten/src/ATen/native/quantized/cpu/qmul.cpp index d7207ccdf5463..fe997c7a42b6c 100644 --- a/aten/src/ATen/native/quantized/cpu/qmul.cpp +++ b/aten/src/ATen/native/quantized/cpu/qmul.cpp @@ -100,8 +100,8 @@ Tensor _mul_out_xnnpack( if(ReLUFused) { /* - * FIXME: use acticationLimits() - * With , MSVC runs into "error C3862: indetifier activationLimits not + * FIXME: use activationLimits() + * With , MSVC runs into "error C3862: identifier activationLimits not * found". 
*/ constexpr int64_t qmin = std::numeric_limits::min(); diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl b/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl index ea83f48da0f70..0f822afd6da3c 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl @@ -336,6 +336,7 @@ def define_qnnpack(third_party, labels = []): ":ukernels_sse2", ":ukernels_sse41", ":ukernels_ssse3", + third_party("clog"), third_party("cpuinfo"), third_party("FP16"), third_party("FXdiv"), diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/conv-run.cc b/aten/src/ATen/native/quantized/cpu/qnnpack/src/conv-run.cc index c6b7d8cb42049..863ab37e2b2c5 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/conv-run.cc +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/conv-run.cc @@ -400,7 +400,7 @@ enum pytorch_qnnp_status qnnpackConv( threadpool); if (status != pytorch_qnnp_status_success) { pytorch_qnnp_log_error( - "failed to run covolution op setup to setup indirection buffer."); + "failed to run convolution op setup to setup indirection buffer."); return status; } } diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/deconv-run.cc b/aten/src/ATen/native/quantized/cpu/qnnpack/src/deconv-run.cc index 87e3fd2000775..1660bd61e205c 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/deconv-run.cc +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/deconv-run.cc @@ -168,7 +168,7 @@ enum pytorch_qnnp_status qnnpackDeConv( threadpool); if (status != pytorch_qnnp_status_success) { pytorch_qnnp_log_error( - "failed to run decovolution op setup to setup indirection buffer."); + "failed to run deconvolution op setup to setup indirection buffer."); return status; } } diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/fully-connected-sparse.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/fully-connected-sparse.c index 71226ab5250e1..857d78e57625c 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/fully-connected-sparse.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/fully-connected-sparse.c @@ -72,7 +72,7 @@ enum pytorch_qnnp_status pytorch_qnnp_create_fully_connected_sparse_dq_nc_q8( if (kernel_row_block_size == 8 && kernel_col_block_size == 1) { // This is to gate 8x1 on SSE2 since we have not implemented SSE2 - // kernel that suppors 8x1 sparsity pattern. + // kernel that supports 8x1 sparsity pattern. if (pytorch_qnnp_params.q8gemm_sparse_c8x1.packA == NULL) { status = pytorch_qnnp_status_invalid_parameter; goto error; diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/indirection.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/indirection.c index 86432e6c1b242..232c015338bbe 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/indirection.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/indirection.c @@ -208,7 +208,7 @@ void pytorch_qnnp_indirection_init_conv3d( * width * * step_height: The number of pointers to traverse to move from an output - * pixel's first input's index in the indirection bufffer to that of the + * pixel's first input's index in the indirection buffer to that of the * output pixel one ROW (one output y) after it. * i.e. 
if indirection_buffer[j] points to the first input pixel used to * compute the i'th output pixel, then diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/init.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/init.c index b2ea18c669c67..b3f1cf40fcc33 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/init.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/init.c @@ -81,7 +81,7 @@ static void init(void) { .packA = pytorch_q8gemm_sparse_packA_ukernel_4x4__aarch32_neon, .mr = 4, .nr = 8, - .kr = 4, // kr is really 1 but we set it to 4 because we resuse 4x4 prepacking kernel + .kr = 4, // kr is really 1 but we set it to 4 because we reuse 4x4 prepacking kernel .log2_mr = 2, .log2_row_block_size = 3, .row_block_size = 8, @@ -193,7 +193,7 @@ static void init(void) { .packA = pytorch_q8gemm_sparse_packA_ukernel_8x4__aarch64_neon, .mr = 8, .nr = 8, - .kr = 4, // kr is really 1 but we set it to 4 because we resuse 4x4 prepacking kernel + .kr = 4, // kr is really 1 but we set it to 4 because we reuse 4x4 prepacking kernel .log2_mr = 3, .log2_row_block_size = 3, .row_block_size = 8, diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/qnnpack/pack.h b/aten/src/ATen/native/quantized/cpu/qnnpack/src/qnnpack/pack.h index 67684d7fa40c7..14ea256124856 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/qnnpack/pack.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/qnnpack/pack.h @@ -116,7 +116,7 @@ static inline void pytorch_pack_q8gemm_wrq( } if (kzp != 0) { // This part fills the packed weights with zero points for output channels - // when they are not divisble by nr blocking parameter. + // when they are not divisible by nr blocking parameter. // This is needed because in some kernels, sse2 ones, it relies on this // to produce zero as a result of subtracting zero point from weight value. size_t remaining_nr_blocks = ((nr - nr_block_size) & (np - 1)); @@ -239,7 +239,7 @@ static inline void pytorch_pack_q8conv_wrq( } if (kzp != 0) { // This part fills the packed wights with zero points for output channels - // when they are not divisble by nr blocking parameter. + // when they are not divisible by nr blocking parameter. // In that case for (size_t nr_block_offset = 0; nr_block_offset < (nr - nr_block_size); nr_block_offset++) { @@ -361,7 +361,7 @@ static inline void pytorch_pack_q8deconv_wrq( } if (kzp != 0) { // This part fills the packed wights with zero points for output channels - // when they are not divisble by nr blocking parameter. + // when they are not divisible by nr blocking parameter. 
// In that case for (size_t nr_block_offset = 0; nr_block_offset < (nr - nr_block_size); nr_block_offset++) { diff --git a/aten/src/ATen/native/quantized/cpu/qsigmoid.cpp b/aten/src/ATen/native/quantized/cpu/qsigmoid.cpp index 92703f322c29b..159da6e72febe 100644 --- a/aten/src/ATen/native/quantized/cpu/qsigmoid.cpp +++ b/aten/src/ATen/native/quantized/cpu/qsigmoid.cpp @@ -108,7 +108,7 @@ Tensor sigmoid_quantized_cpu(const Tensor& qx) { #endif // USE_PYTORCH_QNNPACK Tensor qy; AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qsigmoid", [&]() { - // Naive implemenentation: uses dequantize/execute/quantize routine + // Naive implementation: uses dequantize/execute/quantize routine // - Output scale is set to 1.0 / 2^(BIT_NUM) // - For signed types output zero point is set to 0 // - For unsigned types output zero point is set to (qmax + qmin) / 2.0 diff --git a/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu b/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu index 0580c47b8c627..3574bfe28f505 100644 --- a/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu @@ -545,7 +545,7 @@ Tensor qembeddingbag_4bit_unpack(const Tensor& packed_weight) { int BIT_RATE = 4; const auto input_rows = packed_weight.size(0); const auto input_columns = packed_weight.size(1); - const auto* input_data = packed_weight.data_ptr(); + const auto* input_data = packed_weight.const_data_ptr(); int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; // The last 4 bytes per row are two fp16 scale and zero_point. diff --git a/aten/src/ATen/native/quantized/cudnn/Conv.cpp b/aten/src/ATen/native/quantized/cudnn/Conv.cpp index 4cb496640746f..bb97a69859cb4 100644 --- a/aten/src/ATen/native/quantized/cudnn/Conv.cpp +++ b/aten/src/ATen/native/quantized/cudnn/Conv.cpp @@ -75,7 +75,7 @@ void PackedConvWeightCudnn::apply_impl_helper(const at::Tensor& qua if (bias_.has_value()) { // the input bias is a 1-D tensor whose size is the same as the size of the second dimension of quantized_output. // we need to add trailing dimensions in order to properly broadcast bias, otherwise broadcast_to will fail. - // the number of trailling dimensions is quantized_output.dim() - 2, so the new size of the broadcast_bias + // the number of trailing dimensions is quantized_output.dim() - 2, so the new size of the broadcast_bias // becomes quantized_output.dim() - 2 + 1. nothing needs to be done for the leading dimensions std::vector new_size(quantized_output.dim() - 1, 1); new_size[0] = bias_.value().size(0); @@ -157,7 +157,7 @@ void PackedConvWeightCudnn::apply_impl_helper(const at::Tensor& qua c10::optional bias_mult_op; c10::optional sum_conv_bias_op; if (bias_.has_value()) { - // we can't directly assign bias_mult_op becauase operator= is deleted for cudnn_frontend::Operation; + // we can't directly assign bias_mult_op because operator= is deleted for cudnn_frontend::Operation; // alternatively, I think we can use std::unique_ptr and dynamically allocate these builder ops // but here, we chose to do it statically. 
c10::optional::emplace() enables this approach @@ -402,7 +402,7 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_relu.new"), QConvInt8<2, true>::run); } -} // anonyous namespace +} // anonymous namespace } // namespace at::native diff --git a/aten/src/ATen/native/quantized/cudnn/Linear.cpp b/aten/src/ATen/native/quantized/cudnn/Linear.cpp index 37e08ba7861d0..f9333d6fbed7a 100644 --- a/aten/src/ATen/native/quantized/cudnn/Linear.cpp +++ b/aten/src/ATen/native/quantized/cudnn/Linear.cpp @@ -103,7 +103,7 @@ void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_outp if (bias_.has_value()) { // the input bias is a 1-D tensor whose size is the same as the size of the last dimension of quantized_output // we need to add trailing dimensions in order to properly broadcast bias, otherwise broadcast_to will fail. - // the number of trailling dimensions is quantized_output.dim() - 2. We also prepend a leading dimension for clarity + // the number of trailing dimensions is quantized_output.dim() - 2. We also prepend a leading dimension for clarity std::vector new_size(quantized_output.dim(), 1); new_size.back() = bias_.value().size(0); broadcasted_bias = bias_.value().clone().reshape(new_size); @@ -186,7 +186,7 @@ void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_outp c10::optional bias_mult_op; c10::optional sum_linear_bias_op; if (bias_.has_value()) { - // we can't directly assign bias_mult_op becauase operator= is deleted for cudnn_frontend::Operation; + // we can't directly assign bias_mult_op because operator= is deleted for cudnn_frontend::Operation; // alternatively, I think we can use std::unique_ptr and dynamically allocate these builder ops // but here, we chose to do it statically. 
c10::optional::emplace() enables this approach diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp index 41cacf20114de..97de2cfbf078a 100644 --- a/aten/src/ATen/native/quantized/library.cpp +++ b/aten/src/ATen/native/quantized/library.cpp @@ -96,7 +96,7 @@ TORCH_LIBRARY(quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_dilation(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_groups(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_transpose(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int"), {at::Tag::pt2_compliant_tag}); - // conv_tranpsose + // conv_transpose m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose1d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose3d(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"), {at::Tag::pt2_compliant_tag}); @@ -149,6 +149,7 @@ TORCH_LIBRARY(quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic_fp16_unpacked_weight(Tensor X, Tensor weight, Tensor bias) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_leaky_relu(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i, float negative_slope) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_tanh(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); @@ -246,6 +247,8 @@ TORCH_LIBRARY(_quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack")); m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack")); m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack_fp16_legacy(Tensor W, Tensor? 
B=None) -> Tensor W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_fbgemm_pack_gemm_matrix_fp16(Tensor W) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_fbgemm_linear_fp16_weight(Tensor X, Tensor W, Tensor B, int out_channel) -> Tensor")); } TORCH_LIBRARY(onednn, m) { @@ -254,16 +257,20 @@ TORCH_LIBRARY(onednn, m) { m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv_prepack(Tensor weight, Tensor w_scales, float x_scale, int x_zp, int[] stride, int[] padding, int[] dilation, int groups, int[]? x_shape=None) -> Tensor")); // Conv1D/2D/3D with unary postop - m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv1d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor")); - m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor")); - m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv3d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv1d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv3d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor")); // Conv2D with binary postop - m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise.binary(Tensor qx, float x_scale, int x_zero_point, Tensor qaccum, float accum_scale, int accum_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise.binary(Tensor qx, float x_scale, int x_zero_point, Tensor qaccum, float accum_scale, int accum_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? 
output_dtype, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor")); // Linear prepack m.def(TORCH_SELECTIVE_SCHEMA("onednn::qlinear_prepack(Tensor weight, int[]? x_shape) -> Tensor")); // Linear with unary postop m.def(TORCH_SELECTIVE_SCHEMA("onednn::qlinear_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, float output_scale, int output_zero_point, ScalarType? output_dtype, str post_op_name, Scalar?[] post_op_args, str post_op_algorithm) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qlinear_pointwise.tensor(Tensor qx, Tensor x_scale, Tensor x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, float output_scale, int output_zero_point, ScalarType? output_dtype, str post_op_name, Scalar?[] post_op_args, str post_op_algorithm) -> Tensor")); + // Linear with binary postop + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qlinear_pointwise.binary(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, float output_scale, int output_zero_point, ScalarType? output_dtype, Tensor? other, float other_scale, int other_zp, str binary_post_op, float binary_alpha, str unary_post_op, Scalar?[] unary_post_op_args, str unary_post_op_algorithm) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qlinear_pointwise.binary_tensor(Tensor qx, Tensor x_scale, Tensor x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, float output_scale, int output_zero_point, ScalarType? output_dtype, Tensor? other, float other_scale, int other_zp, str binary_post_op, float binary_alpha, str unary_post_op, Scalar?[] unary_post_op_args, str unary_post_op_algorithm) -> Tensor")); } diff --git a/aten/src/ATen/native/quantized/qconv_unpack.cpp b/aten/src/ATen/native/quantized/qconv_unpack.cpp index cff99560b7eec..fe4007c712ce5 100644 --- a/aten/src/ATen/native/quantized/qconv_unpack.cpp +++ b/aten/src/ATen/native/quantized/qconv_unpack.cpp @@ -181,9 +181,7 @@ class QConvTranspose final { IValue unpack_quantized_prepacked_sizes_conv2d(const IValue& ivalue) { auto params = ivalue.toCustomClass>(); - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = params->unpack(); + auto [weight, bias] = params->unpack(); at::OptionalIntArrayRef bias_sizes = c10::nullopt; if (bias && bias->defined()) { bias_sizes = bias->sizes(); diff --git a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h index 231da0e911c34..0e79ed809ae6d 100644 --- a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h +++ b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h @@ -62,7 +62,7 @@ Tensor _flatten_indices_impl(const Tensor& indices, IntArrayRef size) { .build(); { - const auto* RESTRICT ptr_indices = indices.data_ptr(); + const auto* RESTRICT ptr_indices = indices.const_data_ptr(); KernelLauncher::launch(iter, // NOTE: capture by value required by CUDA @@ -87,7 +87,7 @@ Tensor _flatten_indices_impl(const Tensor& indices, IntArrayRef size) { template